Repository: agenium-scale/nsimd Branch: master Commit: 702f4d179ff0 Files: 148 Total size: 2.7 MB Directory structure: gitextract_56lzr4bw/ ├── .clang-format ├── .gitignore ├── CMakeLists.txt ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── benches/ │ └── benches.hpp ├── build.nsconfig ├── doc/ │ ├── Makefile.nix │ ├── Makefile.win │ ├── markdown/ │ │ ├── compilers_and_versions.md │ │ ├── concepts.md │ │ ├── defines.md │ │ ├── faq.md │ │ ├── fp16.md │ │ ├── how_tests_are_done.md │ │ ├── memory.md │ │ ├── modules/ │ │ │ ├── .gitignore │ │ │ └── fixed_point/ │ │ │ └── overview.md │ │ ├── pack.md │ │ └── tutorial.md │ ├── md2html.cpp │ └── what_is_wrapped.cpp ├── egg/ │ ├── __init__.py │ ├── common.py │ ├── cuda.py │ ├── experiments/ │ │ ├── gen_sleef_operators.py │ │ ├── round-ppc.c │ │ └── upcvt-sve.c │ ├── gen_adv_c_api.py │ ├── gen_adv_cxx_api.py │ ├── gen_archis.py │ ├── gen_base_apis.py │ ├── gen_benches.py │ ├── gen_doc.py │ ├── gen_friendly_but_not_optimized.py │ ├── gen_modules.py │ ├── gen_scalar_utilities.py │ ├── gen_src.py │ ├── gen_tests.py │ ├── get_sleef_code.py │ ├── hatch.py │ ├── modules/ │ │ ├── fixed_point/ │ │ │ ├── gen_doc.py │ │ │ ├── gen_tests.py │ │ │ └── hatch.py │ │ ├── memory_management/ │ │ │ └── hatch.py │ │ ├── random/ │ │ │ └── hatch.py │ │ ├── spmd/ │ │ │ └── hatch.py │ │ └── tet1d/ │ │ └── hatch.py │ ├── oneapi.py │ ├── operators.py │ ├── platform_arm.py │ ├── platform_cpu.py │ ├── platform_ppc.py │ ├── platform_x86.py │ ├── rocm.py │ ├── scalar.py │ └── x86_load_store_deg234.py ├── examples/ │ ├── module_fixed_point.cpp │ └── tutorial.cpp ├── include/ │ └── nsimd/ │ ├── c_adv_api.h │ ├── cxx_adv_api.hpp │ ├── cxx_adv_api_aliases.hpp │ ├── modules/ │ │ ├── fixed_point.hpp │ │ ├── memory_management.hpp │ │ ├── spmd.hpp │ │ └── tet1d.hpp │ ├── nsimd-all.h │ ├── nsimd-all.hpp │ └── nsimd.h ├── scripts/ │ ├── FindNSIMD.cmake │ ├── aarch64-linux-gnu-clang++.sh │ ├── aarch64-linux-gnu-clang.sh │ ├── build-tests.bat │ ├── build-tests.sh │ 
├── build.bat │ ├── build.sh │ ├── ci-clang.txt │ ├── ci-scale.txt │ ├── ci-test.txt │ ├── ci.sh │ ├── compile-gmp-mpfr-for-wasm.sh │ ├── gen_github_doc.sh │ ├── hipcc.sh │ ├── init-benches-deps.sh │ ├── local-ci-rerun.ini │ ├── local-ci.ini │ ├── local-ci.sh │ ├── one-liner.c │ ├── powerpc64le-linux-gnu-clang++.sh │ ├── powerpc64le-linux-gnu-clang.sh │ ├── setup.bat │ └── setup.sh ├── src/ │ ├── dd.h │ ├── df.h │ ├── estrin.h │ ├── fp16.cpp │ ├── gpu.cpp │ ├── helperadvsimd.h │ ├── helperavx.h │ ├── helperavx2.h │ ├── helperavx512f.h │ ├── helperneon32.h │ ├── helperpower_128.h │ ├── helpersse2.h │ ├── helpersve.h │ ├── memory.cpp │ ├── misc.h │ ├── rempitab.c │ ├── rename.h │ ├── renameadvsimd.h │ ├── renameavx.h │ ├── renameavx2.h │ ├── renameavx512f.h │ ├── renameneon32.h │ ├── renamesse2.h │ ├── renamesse4.h │ ├── renamesve.h │ ├── renamevsx.h │ ├── sleefdp.c │ ├── sleefsimddp.c │ ├── sleefsimddp_emulation.c │ ├── sleefsimdsp.c │ ├── sleefsimdsp_emulation.c │ ├── sleefsp.c │ └── ufp.cpp └── tests/ ├── CMakeLists.txt.sh ├── FindNSIMD.cmake.sh ├── allocator.cpp ├── assign_arith.cpp ├── booleans.cpp ├── c11_vec.c ├── cxx_adv_api_aliases.cpp ├── fp16.prec11.c ├── get_pack.cpp ├── memory.cpp ├── memory.prec11.c ├── modules/ │ └── common.hpp ├── nsimd-all.cpp ├── nsimd.cpp ├── nsimd.prec11.c ├── operator_vector_scalar.cpp ├── shifts.cpp ├── templated_loads_stores.cpp ├── tests_helpers.hpp ├── to_pack.cpp ├── to_pack_interleave.cpp └── ufp.cpp ================================================ FILE CONTENTS ================================================ ================================================ FILE: .clang-format ================================================ Standard: Cpp03 ColumnLimit: 79 ================================================ FILE: .gitignore ================================================ # Common build dirs build*/ # Dependencies nstools/ # Binaries *.o *.so *.pyc *.exe *.dll *.dylib # Generated files ## API src/api_*.cpp src/api_* ## 
Plateform specific code include/nsimd/arm include/nsimd/cpu include/nsimd/cxx_adv_api_functions.hpp include/nsimd/friendly_but_not_optimized.hpp include/nsimd/functions.h include/nsimd/ppc include/nsimd/x86 ## Tests tests/c_base tests/cxx_base tests/cxx_adv tests/modules/tet1d/ tests/modules/fixed_point/ tests/modules/rand/*.cpp tests/modules/spmd/ tests/modules/random/ ## Benches benches/cxx_adv ## Modules include/nsimd/modules/tet1d/ include/nsimd/modules/spmd/ include/nsimd/modules/fixed_point/ include/nsimd/scalar_utilities.h ## Doc doc/html/* !doc/html/assets/ doc/markdown/overview.md doc/markdown/api.md doc/markdown/api_*.md doc/markdown/module_fixed_point_api*.md doc/markdown/module_fixed_point_overview.md doc/markdown/module_spmd_api*.md doc/markdown/module_spmd_overview.md doc/markdown/module_memory_management_overview.md doc/md2html doc/tmp.html ## Ulps ulps/ ## CI _ci/ ================================================ FILE: CMakeLists.txt ================================================ # MIT License # # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. cmake_minimum_required(VERSION 3.0.2) project(NSIMD VERSION 3.0 LANGUAGES C CXX) # ----------------------------------------------------------------------------- # First check that NSIMD code has been generated if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/include/nsimd/functions.h") if (WIN32) execute_process(COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}\\egg\\hatch.py -lf) else() execute_process(COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/egg/hatch.py -lf) endif() endif() # ----------------------------------------------------------------------------- # Compilations options option(NSIMD_ARM32_IS_ARMEL "Set whether ARM32 is in fact armel or armhf" ON) function(nsimd_get_compiler_argument simd_ext argument) if (MSVC) if (CMAKE_CL_64) set(mapping_sse2 "/DSSE2") set(mapping_sse42 "/DSSE42") else() set(mapping_sse2 "/DSSE2;/arch:SSE2") set(mapping_sse42 "/DSSE42;/arch:SSE2") endif() set(mapping_avx "/DAVX;/arch:AVX") set(mapping_avx2 "/DAVX2;/arch:AVX2") set(mapping_avx512_knl "/DAVX512_KNL;/arch:AVX512") set(mapping_avx512_skylake "/DAVX512_SKYLAKE;/arch:AVX512") set(mapping_neon128 "/DNEON128;/arch:VFPv4") set(mapping_aarch64 "/DAARCH64") set(mapping_sve "/DSVE") set(mapping_sve128 "/DSVE128") set(mapping_sve256 "/DSVE256") set(mapping_sve512 "/DSVE512") set(mapping_sve1024 "/DSVE1024") set(mapping_sve2048 "/DSVE2048") set(mapping_vmx "/DVMX") set(mapping_vsx "/DVSX") set(mapping_cuda "/DCUDA") set(mapping_rocm "/DROCM") set(mapping_oneapi "/DONEAPI") else() set(mapping_sse2 "-DSSE2;-msse2" ) set(mapping_sse42 "-DSSE42;-msse4.2" ) set(mapping_avx "-DAVX;-mavx;-mno-avx256-split-unaligned-load" ";-mno-avx256-split-unaligned-store" ) set(mapping_avx2 "-DAVX2;-mavx2;-mfma;-mno-avx256-split-unaligned-load" 
";-mno-avx256-split-unaligned-store" ) set(mapping_avx512_knl "-DAVX512_KNL;-mavx512f;-mavx512pf;-mavx512er" ";-mavx512cd") set(mapping_avx512_skylake "-DAVX512_SKYLAKE;-mavx512f;-mavx512dq" ";-mavx512cd;-mavx512bw;-mavx512vl") if (NSIMD_ARM32_IS_ARMEL) set(mapping_neon128 "-DNEON128;-mfloat-abi=softfp;-mfpu=neon") else() set(mapping_neon128 "-DNEON128;-mfpu=neon") endif() set(mapping_aarch64 "-DAARCH64") set(mapping_sve "-DSVE;-march=armv8.2-a+sve") set(mapping_sve128 "-DSVE128;-march=armv8.2-a+sve;-msve-vector-bits=128") set(mapping_sve256 "-DSVE256;-march=armv8.2-a+sve;-msve-vector-bits=256") set(mapping_sve512 "-DSVE512;-march=armv8.2-a+sve;-msve-vector-bits=512") set(mapping_sve1024 "-DSVE1024;-march=armv8.2-a+sve" ";-msve-vector-bits=1024") set(mapping_sve2048 "-DSVE2048;-march=armv8.2-a+sve" ";-msve-vector-bits=2048") set(mapping_vmx "-DVMX;-mcpu=powerpc64le;-maltivec") set(mapping_vsx "-DVSX;-mcpu=powerpc64le;-mvsx") set(mapping_cuda "-DCUDA") set(mapping_rocm "-DROCM") set(mapping_oneapi "-DONEAPI") endif() if (DEFINED mapping_${simd_ext}) set(${argument} "${mapping_${simd_ext}}" PARENT_SCOPE) else() if (MSVC) set(${argument} "/DCPU" PARENT_SCOPE) else() set(${argument} "-DCPU" PARENT_SCOPE) endif() endif() endfunction() if (NOT DEFINED simd) set(simd "cpu") endif() nsimd_get_compiler_argument(${simd} NSIMD_COMPILATION_OPTIONS) # ----------------------------------------------------------------------------- # Object file selection set(NSIMD_OBJS "fp16;gpu;memory;api_cpu;rempitab;sleefsp;sleefdp") if ("${simd}" STREQUAL "sse2") set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;sleef_sse2_f32;sleef_sse2_f64") elseif ("${simd}" STREQUAL "sse42") set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;api_sse42;" "sleef_sse2_f32;sleef_sse2_f64;" "sleef_sse42_f32;sleef_sse42_f64") elseif ("${simd}" STREQUAL "avx") set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;api_sse42;api_avx;" "sleef_sse2_f32;sleef_sse2_f64;" "sleef_sse42_f32;sleef_sse42_f64;" "sleef_avx_f32;sleef_avx_f64") elseif ("${simd}" 
STREQUAL "avx2") set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;api_sse42;api_avx;api_avx2;" "sleef_sse2_f32;sleef_sse2_f64;" "sleef_sse42_f32;sleef_sse42_f64;" "sleef_avx_f32;sleef_avx_f64;" "sleef_avx2_f32;sleef_avx2_f64") elseif ("${simd}" STREQUAL "avx512_knl") set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;api_sse42;api_avx;api_avx2" "sleef_sse2_f32;sleef_sse2_f64;" "sleef_sse42_f32;sleef_sse42_f64;" "sleef_avx_f32;sleef_avx_f64;" "sleef_avx2_f32;sleef_avx2_f64;" "api_avx512_knl;sleef_avx512_knl_f32;sleef_avx512_knl_f64") elseif ("${simd}" STREQUAL "avx512_skylake") set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;api_sse42;api_avx;api_avx2;" "api_avx512_skylake;sleef_avx512_skylake_f32;" "sleef_sse2_f32;sleef_sse2_f64;" "sleef_sse42_f32;sleef_sse42_f64;" "sleef_avx_f32;sleef_avx_f64;" "sleef_avx2_f32;sleef_avx2_f64;" "sleef_avx512_skylake_f64") elseif ("${simd}" STREQUAL "neon128") set(NSIMD_OBJS "${NSIMD_OBJS};api_neon128;" "sleef_neon128_f32;sleef_neon128_f64") elseif ("${simd}" STREQUAL "aarch64") set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;" "sleef_aarch64_f32;sleef_aarch64_f64") elseif ("${simd}" STREQUAL "sve") set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;api_sve;" "sleef_aarch64_f32;sleef_aarch64_f64;" "sleef_sve_f32;sleef_sve_f64") elseif ("${simd}" STREQUAL "sve128") set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;api_sve128;" "sleef_aarch64_f32;sleef_aarch64_f64;" "sleef_sve128_f32;sleef_sve128_f64") elseif ("${simd}" STREQUAL "sve256") set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;api_sve256;" "sleef_aarch64_f32;sleef_aarch64_f64;" "sleef_sve256_f32;sleef_sve256_f64") elseif ("${simd}" STREQUAL "sve512") set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;api_sve512;" "sleef_aarch64_f32;sleef_aarch64_f64;" "sleef_sve512_f32;sleef_sve512_f64") elseif ("${simd}" STREQUAL "sve1024") set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;api_sve1024;" "sleef_aarch64_f32;sleef_aarch64_f64;" "sleef_sve1024_f32;sleef_sve1024_f64") elseif ("${simd}" STREQUAL "sve2048") set(NSIMD_OBJS 
"${NSIMD_OBJS};api_aarch64;api_sve2048;" "sleef_aarch64_f32;sleef_aarch64_f64;" "sleef_sve2048_f32;sleef_sve2048_f64") elseif ("${simd}" STREQUAL "vmx") set(NSIMD_OBJS "${NSIMD_OBJS};api_vmx;sleef_vmx_f32;sleef_vmx_f64") elseif ("${simd}" STREQUAL "vsx") set(NSIMD_OBJS "${NSIMD_OBJS};api_vmx;api_vsx;sleef_vmx_f32;sleef_vmx_f64;" "sleef_vsx_f32;sleef_vsx_f64") endif() # ----------------------------------------------------------------------------- # Rules for building the library set(NSIMD_LIB_DEPS "") foreach(o ${NSIMD_OBJS}) if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/${o}.cpp") add_library(${o} OBJECT "${CMAKE_CURRENT_SOURCE_DIR}/src/${o}.cpp") elseif(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/${o}.c") add_library(${o} OBJECT "${CMAKE_CURRENT_SOURCE_DIR}/src/${o}.c") elseif(("${o}" STREQUAL "sleef_neon128_f64") OR ("${o}" STREQUAL "sleef_vmx_f64")) add_library(${o} OBJECT "${CMAKE_CURRENT_SOURCE_DIR}/src/sleefsimddp_emulation.c") elseif("${o}" STREQUAL "sleef_vmx_f32") add_library(${o} OBJECT "${CMAKE_CURRENT_SOURCE_DIR}/src/sleefsimdsp_emulation.c") elseif(o MATCHES "sleef_.*_f32") add_library(${o} OBJECT "${CMAKE_CURRENT_SOURCE_DIR}/src/sleefsimdsp.c") elseif(o MATCHES "sleef_.*_f64") add_library(${o} OBJECT "${CMAKE_CURRENT_SOURCE_DIR}/src/sleefsimddp.c") endif() if (MSVC) set(sleef_cflags "/DNDEBUG;/DDORENAME=1") else() set(sleef_cflags "-DNDEBUG;-DDORENAME=1") endif() set_property(TARGET ${o} PROPERTY POSITION_INDEPENDENT_CODE ON) target_include_directories(${o} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") if (MSVC) target_compile_definitions(${o} PUBLIC "/D_CRT_SECURE_NO_WARNINGS") endif() set(buf "") if ("${o}" STREQUAL "api_sse2") nsimd_get_compiler_argument("sse2" buf) elseif ("${o}" STREQUAL "api_sse42") nsimd_get_compiler_argument("sse42" buf) elseif ("${o}" STREQUAL "api_avx") nsimd_get_compiler_argument("avx" buf) elseif ("${o}" STREQUAL "api_avx2") nsimd_get_compiler_argument("avx2" buf) elseif ("${o}" STREQUAL "api_avx512_knl") 
nsimd_get_compiler_argument("avx512_knl" buf) elseif ("${o}" STREQUAL "api_avx512_skylake") nsimd_get_compiler_argument("avx512_skylake" buf) elseif ("${o}" STREQUAL "api_neon128") nsimd_get_compiler_argument("neon128" buf) elseif ("${o}" STREQUAL "api_aarch64") nsimd_get_compiler_argument("aarch64" buf) elseif ("${o}" STREQUAL "api_sve") nsimd_get_compiler_argument("sve" buf) elseif ("${o}" STREQUAL "api_sve128") nsimd_get_compiler_argument("sve128" buf) elseif ("${o}" STREQUAL "api_sve256") nsimd_get_compiler_argument("sve256" buf) elseif ("${o}" STREQUAL "api_sve512") nsimd_get_compiler_argument("sve512" buf) elseif ("${o}" STREQUAL "api_sve1024") nsimd_get_compiler_argument("sve1024" buf) elseif ("${o}" STREQUAL "api_sve2048") nsimd_get_compiler_argument("sve2048" buf) elseif ("${o}" STREQUAL "api_vmx") nsimd_get_compiler_argument("vmx" buf) elseif ("${o}" STREQUAL "api_vsx") nsimd_get_compiler_argument("vsx" buf) elseif ("${o}" STREQUAL "api_cuda") nsimd_get_compiler_argument("cuda" buf) elseif ("${o}" STREQUAL "api_rocm") nsimd_get_compiler_argument("rocm" buf) elseif ("${o}" STREQUAL "api_cpu") nsimd_get_compiler_argument("cpu" buf) elseif ("${o}" STREQUAL "rempitab") list(APPEND buf "${sleef_cflags}") elseif ("${o}" STREQUAL "sleefsp") list(APPEND buf "${sleef_cflags}") elseif ("${o}" STREQUAL "sleefdp") list(APPEND buf "${sleef_cflags}") elseif ("${o}" MATCHES "sleef_sse2_") nsimd_get_compiler_argument("sse2" buf) list(APPEND buf "-DNSIMD_SSE2;-DENABLE_SSE2=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_sse42_") nsimd_get_compiler_argument("sse42" buf) list(APPEND buf "-DNSIMD_SSE42;-DENABLE_SSE4=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_avx_") nsimd_get_compiler_argument("avx" buf) list(APPEND buf "-DNSIMD_AVX;-DENABLE_AVX=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_avx2_") nsimd_get_compiler_argument("avx2" buf) list(APPEND buf "-DNSIMD_AVX2;-DENABLE_AVX2=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_avx512_knl_") 
nsimd_get_compiler_argument("avx512_knl" buf) list(APPEND buf "-DNSIMD_AVX512_KNL;-DENABLE_AVX512F=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_avx512_skylake_") nsimd_get_compiler_argument("avx512_skylake" buf) list(APPEND buf "-DNSIMD_AVX512_SKYLAKE;-DENABLE_AVX512F=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_neon128_") nsimd_get_compiler_argument("neon128" buf) list(APPEND buf "-DNSIMD_NEON128;-DENABLE_NEON32=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_aarch64_") nsimd_get_compiler_argument("aarch64" buf) list(APPEND buf "-DNSIMD_AARCH64;-DENABLE_ADVSIMD=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_sve_") nsimd_get_compiler_argument("sve" buf) list(APPEND buf "-DNSIMD_SVE;-DENABLE_SVE=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_sve128_") nsimd_get_compiler_argument("sve128" buf) list(APPEND buf "-DNSIMD_SVE128;-DENABLE_SVE=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_sve256_") nsimd_get_compiler_argument("sve256" buf) list(APPEND buf "-DNSIMD_SVE256;-DENABLE_SVE=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_sve512_") nsimd_get_compiler_argument("sve512" buf) list(APPEND buf "-DNSIMD_SVE512;-DENABLE_SVE=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_sve1024_") nsimd_get_compiler_argument("sve1024" buf) list(APPEND buf "-DNSIMD_SVE1024;-DENABLE_SVE=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_sve2048_") nsimd_get_compiler_argument("sve2048" buf) list(APPEND buf "-DNSIMD_SVE2048;-DENABLE_SVE=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_vmx_") nsimd_get_compiler_argument("vmx" buf) list(APPEND buf "-DNSIMD_VMX;-DENABLE_VSX=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_vsx_") nsimd_get_compiler_argument("vsx" buf) list(APPEND buf "-DNSIMD_VSX;-DENABLE_VSX=1;${sleef_cflags}") else() set(buf "") endif() if (NOT "${buf}" STREQUAL "") target_compile_options(${o} PUBLIC "${buf}") endif() list(APPEND NSIMD_LIB_DEPS "$") endforeach() set(NSIMD_LIB_TARGET "nsimd_${simd}") add_library(${NSIMD_LIB_TARGET} SHARED 
${NSIMD_LIB_DEPS}) # ----------------------------------------------------------------------------- # Installation stuff if (WIN32) install(TARGETS ${NSIMD_LIB_TARGET} RUNTIME DESTINATION lib ARCHIVE DESTINATION lib) else() install(TARGETS ${NSIMD_LIB_TARGET} LIBRARY DESTINATION lib) endif() install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/nsimd DESTINATION include) ================================================ FILE: CONTRIBUTING.md ================================================ ## How to Contribute to `nsimd`? You are welcome to contribute to `nsimd`. This document gives some details on how to add/wrap new intrinsics. When you have finished fixing some bugs or adding some new features, please make a pull request. One of our repository maintainer will then merge or comment the pull request. ## Prerequisites - Respect the philosophy of the library (see [index](index.md).) - Basic knowledge of Python 3. - Good knowledge of C. - Good knowledge of C++. - Good knowledge of SIMD programming. ## How Do I Add Support for a New Intrinsic? ### Introduction `nsimd` currently supports the following architectures: - `CPU`: + `CPU` called `CPU` in source code. This "extension" is not really one as it is only present so that code written with `nsimd` can compile and run on targets not supported by `nsimd` or with no SIMD. - Intel: + `SSE2` called `SSE2` in source code. + `SSE4.2` called `SSE42` in source code. + `AVX` called `AVX` in source code. + `AVX2` called `AVX2` in source code. + `AVX-512` as found on KNLs called `AVX512_KNL` in source code. + `AVX-512` as found on Xeon Skylake CPUs called `AVX512_SKYLAKE` in source code. - Arm + `NEON` 128 bits as found on ARMv7 CPUs called `NEON128` in source code. + `NEON` 128 bits as found on Aarch64 CPUs called `AARCH64` in source code. + `SVE` called `SVE` in source code. + `SVE` 128 bits known at compiled time called `SVE128` in source code. + `SVE` 256 bits known at compiled time called `SVE256` in source code. 
+ `SVE` 512 bits known at compiled time called `SVE512` in source code. + `SVE` 1024 bits known at compiled time called `SVE1024` in source code. + `SVE` 2048 bits known at compiled time called `SVE2048` in source code. - IBM POWERPC + `VMX` 128 bits as found on POWER6 CPUs called `VMX` in source code. + `VSX` 128 bits as found on POWER7/8 CPUs called `VSX` in source code. - NVIDIA + `CUDA` called `CUDA` in source code - AMD + `ROCm` called `ROCM` in source code - Intel oneAPI + `oneAPI` called `ONEAPI` in source code `nsimd` currently supports the following types: - `i8`: signed integers over 8 bits (usually `signed char`), - `u8`: unsigned integers over 8 bits (usually `unsigned char`), - `i16`: signed integers over 16 bits (usually `short`), - `u16`: unsigned integers over 16 bits (usually `unsigned short`), - `i32`: signed integers over 32 bits (usually `int`), - `u32`: unsigned integers over 32 bits (usually `unsigned int`), - `i64`: signed integers over 64 bits (usually `long`), - `u64`: unsigned integers over 64 bits (usually `unsigned long`), - `f16`: floating point numbers over 16 bits in IEEE format called `float16` in the rest of this document (), - `f32`: floating point numbers over 32 bits (usually `float`) - `f64`: floating point numbers over 64 bits (usually `double`), As C and C++ do not support `float16`, `nsimd` provides its own types to handle them. Therefore special care has to be taken when implementing intrinsics/operators on architecures that do not natively supports them. We will make the following misuse of language in the rest of this document. The type taken by intrinsics is of course a SIMD vector and more precisely a SIMD vector of chars or a SIMD vector of `short`s or a SIMD vector of `int`s… Therefore when we will talk about an intrinsic, we will say that it takes type `T` as arguments when it takes in fact a SIMD vector of `T`. 
### Our imaginary intrinsic We will add support to the library for the following imaginary intrinsic: given a SIMD vector, suppose that this intrisic called `foo` takes each element `x` of the vector and compute `1 / (1 - x) + 1 / (1 - x)^2`. Moreover suppose that hardware vendors all propose this intrisic only for floatting point numbers as follows: - CPU (no intrinsics is given of course in standard C and C++) - Intel (no intrinsics is given for `float16`s) + `SSE2`: no intrinsics is provided. + `SSE42`: `_mm_foo_ps` for `float`s and `_mm_foo_pd` for `double`s. + `AVX`: no intrinsics is provided. + `AVX2`: `_mm256_foo_ps` for `float`s and `_mm256_foo_pd` for `double`s. + `AVX512_KNL`: no intrinsics is provided. + `AVX512_SKYLAKE`: `_mm512_foo_ps` for `float`s and `_mm512_foo_pd` for `double`s. - ARM + `NEON128`: `vfooq_f16` for `float16`s, `vfooq_f32` for `float`s and no intrinsics for `double`s. + `AARCH64`: same as `NEON128` but `vfooq_f64` for doubles. + `SVE`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively `float16`s, `float`s and `double`s. + `SVE128`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively `float16`s, `float`s and `double`s. + `SVE256`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively `float16`s, `float`s and `double`s. + `SVE512`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively `float16`s, `float`s and `double`s. + `SVE1024`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively `float16`s, `float`s and `double`s. + `SVE2048`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively `float16`s, `float`s and `double`s. - IBM POWERPC + `VMX`: `vec_foo` for `float`s and no intrinsics for `double`s. + `VSX`: `vec_foo` for `float`s and `double`s. - NVIDIA + `CUDA`: no intrinsics is provided. - AMD + `ROCM`: no intrinsics is provided. - Intel oneAPI + `ONEAPI`: no intrinsics is provided. First thing to do is to declare this new intrinsic to the generation system. 
A lot of work is done by the generation system such as generating all functions signatures for C and C++ APIs, tests, benchmarks and documentation. Of course the default documentation does not say much but you can add a better description. ### Registering the intrinsic (or operator) A function or an intrinsic is called an operator in the generation system. Go at the bottom of `egg/operators.py` and add the following just after the `Rsqrt11` class. ```python class Foo(Operator): full_name = 'foo' signature = 'v foo v' types = common.ftypes domain = Domain('R\{1}') categories = [DocBasicArithmetic] ``` This little class will be processed by the generation system so that operator `foo` will be available for the end-user of the library in both C and C++ APIs. Each member of this class controls how the generation is be done: - `full_name` is a string containing the human readable name of the operator. If not given, the class name will be taken for it. - `signature` is a string describing what kind of arguments and how many takes the operator. This member is mandatory and must respect the following syntax: `return_type name_of_operator arg1_type arg2_type ...` where `return_type` and the `arg*_type` can be taken from the following list: + `v ` SIMD vector parameter + `vx2 ` Structure of 2 SIMD vector parameters + `vx3 ` Structure of 3 SIMD vector parameters + `vx4 ` Structure of 4 SIMD vector parameters + `l ` SIMD vector of logicals parameter + `s ` Scalar parameter + `* ` Pointer to scalar parameter + `c* ` Pointer to const scalar parameter + `_ ` void (only for return type) + `p ` Parameter (integer) In our case `v foo v` means that `foo` takes one SIMD vector as argument and returns a SIMD vector as output. Several signatures will be generated for this intrinsic according to the types it can supports. In our case the intrinsic only support floatting point types. - `types` is a Python list indicating which types are supported by the intrinsic. 
If not given, the intrinsic is supposed to support all types. Some Python lists are predefined to help the programmer: + `ftypes = ['f64', 'f32', 'f16'] ` All floating point types + `ftypes_no_f16 = ['f64', 'f32'] ` + `itypes = ['i64', 'i32', 'i16', 'i8'] ` All signed integer types + `utypes = ['u64', 'u32', 'u16', 'u8'] ` All unsigned integer types + `iutypes = itypes + utypes` + `types = ftypes + iutypes` - `domain` is a string indicating the mathematical domain of definition of the operator. This helps for benchmarks and tests for generating random numbers as inputs in the correct interval. In our case `R\{1}` means all real numbers (of course all floating point numbers) except `1` for which the operator cannot be computed. For examples see how other operators are defined in `egg/operators.py`. - `categories` is a list of Python classes that indicates the generation system to which categories `foo` belongs. The list of available categories is as follows: + `DocShuffle ` for Shuffle functions + `DocTrigo ` for Trigonometric functions + `DocHyper ` for Hyperbolic functions + `DocExpLog ` for Exponential and logarithmic functions + `DocBasicArithmetic ` for Basic arithmetic operators + `DocBitsOperators ` for Bits manipulation operators + `DocLogicalOperators ` for Logicals operators + `DocMisc ` for Miscellaneous + `DocLoadStore ` for Loads & stores + `DocComparison ` for Comparison operators + `DocRounding ` for Rounding functions + `DocConversion ` for Conversion operators If no category corresponds to the operator you want to add to `nsimd` then feel free to create a new category (see the bottom of this document) Many other members are supported by the generation system. We describe them quickly here and will give more details in a later version of this document. Default values are given in square brackets: - `cxx_operator [= None]` in case the operator has a corresponding C++ operator. 
- `autogen_cxx_adv [= True]` in case the C++ advanced API signatures for this operator must not be auto-generated. - `output_to [= common.OUTPUT_TO_SAME_TYPE]` in case the operator output type differs from its input type. Possible values are: + `OUTPUT_TO_SAME_TYPE`: output is of same type as input. + `OUTPUT_TO_SAME_SIZE_TYPES`: output can be any type of same bit size. + `OUTPUT_TO_UP_TYPES`: output can be any type of bit size twice the bit bit size of the input. In this case the input type will never be a 64-bits type. + `OUTPUT_TO_DOWN_TYPES`: output can be any type of bit size half the bit bit size of the input. In this case the input type will never be a 8-bits type. - `src [= False]` in case the code must be compiled in the library. - `load_store [= False]` in case the operator loads/store data from/to memory. - `do_bench [= True]` in case benchmarks for the operator must not be auto-generated. - `desc [= '']` description (in Markdown format) that will appear in the documentation for the operator. - `bench_auto_against_cpu [= True]` for auto-generation of benchmark against `nsimd` CPU implementation. - `bench_auto_against_mipp [= False]` for auto-generation of benchmark against the MIPP library. - `bench_auto_against_sleef [= False]` for auto-generation of benchmark against the Sleef library. - `bench_auto_against_std [= False]` for auto-generation of benchmark against the standard library. - `tests_mpfr [= False]` in case the operator has an MPFR counterpart for comparison, then test the correctness of the operator against it. - `tests_ulps [= False]` in case the auto-generated tests has to compare ULPs (). - `has_scalar_impl [= True]` in case the operator has a CPU scalar and GPU implementation. ### Implementing the operator Now that the operator is registered, all signatures will be generated but the implemenatations will be missing. 
Type ```sh python3 egg/hatch.py -lf ``` and the following files (among many other) should appear: - `include/nsimd/cpu/cpu/foo.h` - `include/nsimd/x86/sse2/foo.h` - `include/nsimd/x86/sse42/foo.h` - `include/nsimd/x86/avx/foo.h` - `include/nsimd/x86/avx2/foo.h` - `include/nsimd/x86/avx512_knl/foo.h` - `include/nsimd/x86/avx512_skylake/foo.h` - `include/nsimd/arm/neon128/foo.h` - `include/nsimd/arm/aarch64/foo.h` - `include/nsimd/arm/sve/foo.h` - `include/nsimd/arm/sve128/foo.h` - `include/nsimd/arm/sve256/foo.h` - `include/nsimd/arm/sve512/foo.h` - `include/nsimd/arm/sve1024/foo.h` - `include/nsimd/arm/sve2048/foo.h` - `include/nsimd/ppc/vmx/foo.h` - `include/nsimd/ppc/vsx/foo.h` They each correspond to the implementations of the operator for each supported architectures. When openening one of these files the implementations in plain C and then in C++ (falling back to the C function) should be there but all the C implementations are reduced to `abort();`. This is the default when none is provided. Note that the "cpu" architecture is just a fallback involving no SIMD at all. This is used on architectures not supported by `nsimd` or when the architectures does not offer any SIMD. Providing implementations for `foo` is done by completing the following Python files: - `egg/platform_cpu.py` - `egg/platform_x86.py` - `egg/platform_arm.py` - `egg/platform_ppc.py` - `egg/scalar.py` - `egg/cuda.py` - `egg/hip.py` - `egg/oneapi.py` The idea is to produce plain C (not C++) code using Python string format. Each of the Python files provides some helper functions to ease as much as possible the programmer's job. But every file provides the same "global" variables available in every functions and is designed in the same way: 1. At the bottom of the file is the `get_impl` function taking the following arguments: + `func ` the name of the operator the system is currently auto-generating. + `simd_ext ` the SIMD extension for which the system wants the implemetation. 
+ `from_typ ` the input type of the argument that will be passed to the operator. + `to_typ ` the output type produced by the operator. 2. Inside this function lies a Python dictionary that provides functions implementing each operator. The string containing the C code for the implementations can be put here directly but usually the string is returned by a Python function that is written above in the same file. 3. At the top of the file lies helper functions that helps generating code. This is specific to each architecture. Do not hesitate to look at it. Let's begin by the `cpu` implementations. It turns out that there is no SIMD extension in this case, and by convention, `simd_ext == 'cpu'` and this argument can therefore be ignored. So we first add an entry to the `impls` Python dictionary of the `get_impl` function: ```python impls = { ... 'reverse': reverse1(from_typ), 'addv': addv(from_typ), 'foo': foo1(from_typ) # Added at the bottom of the dictionary } if simd_ext != 'cpu': raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) ... ``` Then, above in the file we write the Python function `foo1` that will provide the C implementation of operator `foo`: ```python def foo1(typ): return func_body( '''ret.v{{i}} = ({typ})1 / (({typ})1 - {in0}.v{{i}}) + ({typ})1 / ((({typ})1 - {in0}.v{{i}}) * (({typ})1 - {in0}.v{{i}}));'''. \ format(**fmtspec), typ) ``` First note that the arguments names passed to the operator in its C implementation are not known in the Python side. Several other parameters are not known or are cumbersome to find out. Therefore each function has access to the `fmtspec` Python dictionary that hold some of these values: - `in0`: name of the first parameter for the C implementation. - `in1`: name of the second parameter for the C implementation. - `in2`: name of the third parameter for the C implementation. - `simd_ext`: name of the SIMD extension (for the cpu architecture, this is equal to `"cpu"`). - `from_typ`: type of the input. 
- `to_typ`: type of the output. - `typ`: equals `from_typ`, shorter to write as usually `from_typ == to_typ`. - `utyp`: bitfield type of the same size of `typ`. - `typnbits`: number of bits in `typ`. The CPU extension can emulate 64-bits or 128-bits wide SIMD vectors. Each type is a struct containing as much members as necessary so that `sizeof(T) * (number of members) == 64 or 128`. In order to avoid the developper to write two cases (64-bits wide and 128-bits wide) the `func_body` function is provided as a helper. Note that the index `{{i}}` is in double curly brackets to go through two Python string formats: 1. The first pass is done within the `foo1` Python function and replaces `{typ}` and `{in0}`. In this pass `{{i}}` is formatted into `{i}`. 2. The second pass is done by the `func_body` function which unrolls the string to the necessary number and replace `{i}` by the corresponding number. The produced C code will look like one would written the same statement for each members of the input struct. Then note that as plain C (and C++) does not support native 16-bits wide floating point types `nsimd` emulates it with a C struct containing 4 floats (32-bits swide floatting point numbers). In some cases extra care has to be taken to handle this type. For each SIMD extension one can find a `types.h` file (for `cpu` the files can be found in `include/nsimd/cpu/cpu/types.h`) that declares all SIMD types. If you have any doubt on a given type do not hesitate to take a look at this file. Note also that this file is auto-generated and is therefore readable only after a successfull first `python3 egg/hatch -Af`. Now that the `cpu` implementation is written, you should be able to write the implementation of `foo` for other architectures. Each architecture has its particularities. We will cover them now by providing directly the Python implementations and explaining in less details. 
Finally note that `clang-format` is called by the generation system to autoformat produced C/C++ code. Therefore prefer indenting C code strings within the Python according to Python indentations, do not write C code beginning at column 0 in Python files. ### For Intel ```python def foo1(simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v1 = {pre}foo_ps({in0}.v1); ret.v2 = {pre}foo_ps({in0}.v2); return ret;'''.format(**fmtspec) if simd_ext == 'sse2': return emulate_op1('foo', 'sse2', typ) if simd_ext in ['avx', 'avx512_knl']: return split_opn('foo', simd_ext, typ, 1) return 'return {pre}foo{suf}({in0});'.format(**fmtspec) ``` Here are some notes concerning the Intel implementation: 1. `float16`s are emulated with two SIMD vectors of `float`s. 2. When the intrinsic is provided by Intel one can access it easily by constructing it with `{pre}` and `{suf}`. Indeed all Intel intrinsics names follow a pattern with a prefix indicating the SIMD extension and a suffix indicating the type of data. As for `{in0}`, `{pre}` and `{suf}` are provided and contain the correct values with respect to `simd_ext` and `typ`, you do not need to compute them yourself. 3. When the intrinsic is not provided by Intel then one has to use tricks. + For `SSE2` one can use complete emulation, that is, putting the content of the SIMD vector into a C-array, working on it with a simple for loop and loading back the result into the resulting SIMD vector. As said before a lot of helper functions are provided and the `emulate_op1` Python function avoid writing by hand this for-loop emulation. + For `AVX` and `AVX512_KNL`, one can fallback to the "lower" SIMD extension (`SSE42` for `AVX` and `AVX2` for `AVX512_KNL`) by splitting the input vector into two smaller vectors belonging to the "lower" SIMD extension. In this case again the tedious and cumbersome work is done by the `split_opn` Python function. 4. 
Do not forget to add the `foo` entry to the `impls` dictionary in the `get_impl` Python function. ### For ARM ```python def foo1(simd_ext, typ): ret = f16f64(simd_ext, typ, 'foo', 'foo', 1) if ret != '': return ret if simd_ext in neon: return 'return vfooq_{suf}({in0});'.format(**fmtspec) else: return 'return svfoo_{suf}_z({svtrue}, {in0});'.format(**fmtspec) ``` Here are some notes concerning the ARM implementation: 1. `float16`s can be natively supported but this is not mandatory. 2. On 32-bits ARM chips, intrinsics on `double` almost never exist. 3. The Python helper function `f16f64` hides a lot of details concerning the above two points. If the function returns a non-empty string then it means that the returned string contains C code to handle the case given by the pair `(simd_ext, typ)`. We advise you to look at the generated C code. You will see the `nsimd_FP16` macro used. When defined it indicates that `nsimd` is compiled with native `float16` support. This also affects SIMD types (see `nsimd/include/arm/*/types.h`.) 4. Do not forget to add the `foo` entry to the `impls` dictionary in the `get_impl` Python function. ### For IBM POWERPC ```python def foo1(simd_ext, typ): if has_to_be_emulated(simd_ext, typ): return emulation_code('foo', simd_ext, typ, ['v', 'v']) else: return 'return vec_foo({in0});'.format(**fmtspec) ``` Here are some notes concerning the PPC implementation: 1. For VMX, intrinsics on `double` almost never exist. 2. The Python helper function `has_to_be_emulated` returns `True` when the implementation of `foo` concerns float16 or `double`s for `VMX`. When this function returns True you can then use `emulation_code`. 3. The `emulation_code` function returns a generic implementation of an operator. However this implementation is not suitable for every operator and the programmer has to take care of that. 4. Do not forget to add the `foo` entry to the `impls` dictionary in the `get_impl` Python function. 
### The scalar CPU version ```python def foo1(func, typ): normal = \ 'return ({typ})(1 / (1 - {in0}) + 1 / ((1 - {in0}) * (1 - {in0})));' if typ == 'f16': return \ '''#ifdef NSIMD_NATIVE_FP16 {normal} #else return nsimd_f32_to_f16({normal_fp16}); #endif'''. \ format(normal=normal.format(**fmtspec), normal_fp16=normal.format(in0='nsimd_f16_to_f32({in0})')) else: return normal.format(**fmtspec) ``` The only caveat for the CPU scalar implementation is to handle float16 correctly. The easiest way to do it is to have the same implementation as float32 but replacing `{in0}`'s by `nsimd_f16_to_f32({in0})`'s and converting back the float32 result to a float16. ### The GPU versions The GPU generator Python files `cuda.py`, `rocm.py` and `oneapi.py` are a bit different from the other files but it is easy to find where to add the relevant pieces of code. Note that as ROCm syntax is fully compatible with CUDA's, one only needs to modify the `cuda.py` file, while `oneapi.py` is easy to understand. The code to add for float32's is as follows to be added inside the `get_impl` Python function. ```python return '1 / (1 - {in0}) + 1 / ((1 - {in0}) * (1 - {in0}))'.format(**fmtspec) ``` The code for CUDA and ROCm to add for float16's is as follows. It has to be added inside the `get_impl_f16` Python function. ```python arch53_code = '''__half one = __float2half(1.0f); return __hadd( __hdiv(one, __hsub(one, {in0})), __hmul( __hdiv(one, __hsub(one, {in0})), __hdiv(one, __hsub(one, {in0})) ) );'''.format(**fmtspec) ``` As Intel oneAPI natively supports float16's the code is the same as the one for floats: ```python return '1 / (1 - {in0}) + 1 / ((1 - {in0}) * (1 - {in0}))'.format(**fmtspec) ``` ### Implementing the test for the operator Now that we have written the implementations for the `foo` operator we must write the corresponding tests. For tests all generations are done by `egg/gen_tests.py`. Writing tests is simpler. 
The intrinsic that we just implemented can be tested by an already-written test pattern code, namely by the `gen_test` Python function. Here is how the `egg/gen_tests.py` is organized: 1. The entry point is the `doit` function located at the bottom of the file. 2. In the `doit` function a dispatching is done according to the operator that is to be tested. All operators cannot be tested by the same C/C++ code. The reading of all different kinds of tests is rather easy and we are not going through all the code in this document. 3. All Python functions generating test code begin with the following: ```python filename = get_filename(opts, op, typ, lang) if filename == None: return ``` This must be the case for newly created functions. The `get_filename` function ensures that the file must be created with respect to the command line options given to the `egg/hatch.py` script. Then note that to output to a file the Python function `open_utf8` must be used to handle Windows and to automatically put the MIT license at the beginning of generated files. 4. Tests must be written for the C base API, the C++ base API and the C++ advanced API. If you need to create a new kind of tests then the best way is to copy-paste the Python function that produces the test that most resembles the test you want. Then modify the new function to suit your needs. Here is a quick overview of Python functions present in the `egg/gen_tests.py` file: - `gen_nbtrue`, `gen_adv`, `gen_all_any` generate tests for reduction operators. - `gen_reinterpret_convert` generates tests for non closed operators. - `gen_load_store` generates tests for load/store operators. - `gen_reverse` generates tests for one type of shuffle but can be extended for other kinds of shuffles. - `gen_test` generates tests for "standard" operators, typically those that do some computations. This is the kind of tests that can handle our `foo` operator and therefore nothing has to be done on our part. 
## Not all tests are to be done As explained above, doing all tests is not recommended. Take for example the `cvt` operator. Testing `cvt` from say `f32` to `i32` is complicated as the result depends on how NaN, infinities are handled and on the current rounding mode. In turn these parameters depend on the vendor, the chip, the bugs in the chip, the chosen rounding mode by users or other software... The function `should_i_do_the_test` gives a hint on whether to implement the test or not. Its code is really simple and you may need to modify it. The listing below is a possible implementation that takes care of the case described in the previous paragraph. ```python def should_i_do_the_test(operator, tt='', t=''): if operator.name == 'cvt' and t in common.ftypes and tt in common.iutypes: # When converting from float to int to float then we may not # get the initial result because of roundings. As tests are usually # done by going back and forth then both directions get tested in the # end return False if operator.name == 'reinterpret' and t in common.iutypes and \ tt in common.ftypes: # When reinterpreting from int to float we may get NaN or infinities # and no one knows what this will give when going back to ints # especially when float16 are emulated. Again as tests are done by # going back and forth both directions get tested in the end. return False if operator.name in ['notb', 'andb', 'andnotb', 'xorb', 'orb'] and \ t == 'f16': # Bit operations on float16 are hard to check because they are # emulated in most cases. Therefore going back and forth with # reinterprets for doing bitwise operations makes the bit in the last # place wrong. This is normal but makes testing really hard. So for # now we do not test them on float16. 
return False if operator.name in ['len', 'set1', 'set1l', 'mask_for_loop_tail', 'loadu', 'loada', 'storeu', 'storea', 'loadla', 'loadlu', 'storela', 'storelu', 'if_else1']: # These functions are used in almost every test so we consider # that they are extensively tested. return False if operator.name in ['store2a', 'store2u', 'store3a', 'store3u', 'store4a', 'store4u', 'scatter', 'scatter_linear', 'downcvt', 'to_logical']: # These functions are tested along with their load counterparts. # downcvt is tested along with upcvt and to_logical is tested with # to_mask return False return True ``` ### Conclusion At first sight the implementation of `foo` seems complicated because intrinsics for all types and all architectures are not provided by vendors. But `nsimd` provides a lot of helper functions and tries to put away details so that wrapping intrinsics is quickly done and easy, the goal is that the programmer can concentrate on the implementation itself. But be aware that more complicated tricks can be implemented. Browse through a `platform_*.py` file to see what kind of tricks are used and how they are implemented. ## How do I add a new category? Adding a category is much simpler than adding an operator. It suffices to add a class with only one member named `title` as follows: ```python class DocMyCategoryName(DocCategory): title = 'My category name functions' ``` The class must inherit from the `DocCategory` class and its name must begin with `Doc`. The system will then take it into account, generate the entry in the documentation and so on. ## How do I add a new module? A module is a set of functionalities that makes sense to be provided alongside NSIMD but that cannot be part of NSIMD's core. Therefore it is not mandatory to provide all C and C++ API versions or to support all operators. For what follows let's call the module we want to implement `mymod`. 
Include files (written by hand or generated by Python) must be placed into the `nsimd/include/nsimd/modules/mymod` directory and a master header file must be placed at `nsimd/include/nsimd/modules/mymod.h`. You are free to organize the `nsimd/include/nsimd/modules/mymod` folder as you see fit. Your module has to be found by the NSIMD generation system. For this you must create the `nsimd/egg/modules/mymod` directory and `nsimd/egg/modules/mymod/hatch.py` file. The latter must expose the following functions: - `def name()` Return a human readable module name beginning with an uppercase letter. - `def desc()` Return a small description of 4-5 lines of text for the module. This text will appear in the `modules.md` file that lists all the available modules. - `def doc_menu()` Return a Python dictionary containing the menu for when the generation system produces the HTML pages of documentation for the module. The entry markdown file must be `nsimd/doc/markdown/module_mymod_overview.md` for module documentation. Then if your module has no other documentation pages this function can simply return `dict()`. Otherwise it has to return `{'menu_label': 'filename_suffix', ...}` where `menu_label` is a menu entry to be displayed and pointing to `nsimd/doc/markdown/module_mymod_filename_suffix.md`. Several functions in `egg/common.py` (`import common`) have to be used to ease crafting documentation page filenames: + `def get_markdown_dir(opts)` Return the folder into which markdown for documentation has to be put. + `def get_markdown_file(opts, name, module='')` Return the filename to be passed to the `common.open_utf8` function. The `name` argument acts as a suffix as explained above while the `module` argument is the name of the module. - `def doit(opts)` Is the real entry point of the module. This function has the responsibility to generate all the code for your module. It can of course import all Python files from NSIMD and take advantage of the `operators.py` file. 
To respect the switches passed by the user at command line it is recommended to write this function as follows. ```python def doit(opts): common.myprint(opts, 'Generating module mymod') if opts.library: gen_module_headers(opts) if opts.tests: gen_tests(opts) if opts.doc: gen_doc(opts) ``` Tests for the module have to be put into the `nsimd/tests/mymod` directory. ## How do I add a new platform? The list of supported platforms is determined by looking in the `egg` directory and listing all `platform_*.py` files. Each file must contain all SIMD extensions for a given platform. For example the default (no SIMD) is given by `platform_cpu.py`. All the Intel SIMD extensions are given by `platform_x86.py`. Each Python file that implements a platform must be named `platform_[name for platform].py` and must export at least the following functions: - `def get_simd_exts()` Return the list of SIMD extensions implemented by this file as a Python list. - `def get_prev_simd_ext(simd_ext)` Usually SIMD extensions are added over time by vendors and a chip implementing a SIMD extension supports the previous SIMD extensions. This function must return the previous SIMD extension supported by the vendor if it exists otherwise it must return the empty string. Note that `cpu` is the only SIMD extension that has no previous SIMD extension. Every other SIMD extension has at least `cpu` as previous SIMD extension. - `def get_native_typ(simd_ext, typ)` Return the native SIMD type corresponding to the SIMD extension `simd_ext` whose elements are of type `typ`. If `typ` or `simd_ext` is not known then a ValueError exception must be raised. - `def get_type(simd_ext, typ)` Returns the "intrinsic" SIMD type corresponding to the given arithmetic type. If `typ` or `simd_ext` is not known then a ValueError exception must be raised. - `def get_additional_include(func, simd_ext, typ)` Returns additional includes if need be for the implementation of `func` for the given `simd_ext` and `typ`. 
- `def get_logical_type(simd_ext, typ)` Returns the "intrinsic" logical SIMD type corresponding to the given arithmetic type. If `typ` or `simd_ext` is not known then a ValueError exception must be raised. - `def get_nb_registers(simd_ext)` Returns the number of registers for this SIMD extension. - `def get_impl(func, simd_ext, from_typ, to_typ)` Returns the implementation (C code) for `func` on type `typ` for `simd_ext`. If `typ` or `simd_ext` is not known then a ValueError exception must be raised. Any `func` given satisfies `S func(T a0, T a1, ... T an)`. - `def has_compatible_SoA_types(simd_ext)` Returns True iff the given `simd_ext` has structure of arrays types compatible with NSIMD i.e. whose members are v1, v2, ... Returns False otherwise. If `simd_ext` is not known then a ValueError exception must be raised. - `def get_SoA_type(simd_ext, typ, deg)` Returns the structure of arrays types for the given `typ`, `simd_ext` and `deg`. If `simd_ext` is not known or does not name a type whose corresponding SoA types are compatible with NSIMD then a ValueError exception must be raised. - `def emulate_fp16(simd_ext)` Returns True iff the given SIMD extension has to emulate FP16's with two FP32's. Then you are free to implement the SIMd extensions for the platform. See above on how to add the implementations of operators. 
================================================ FILE: LICENSE ================================================ Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ Documentation can be found [here](https://agenium-scale.github.io/nsimd/). We put a lot of effort into [testing](https://agenium-scale.github.io/nsimd/how_tests_are_done.html). # What is NSIMD? At its core, NSIMD is a vectorization library that abstracts [SIMD programming](). It was designed to exploit the maximum power of processors at a low development cost. NSIMD comes with modules. As of now two of them adds support for GPUs to NSIMD. The direction that NSIMD is taking is to provide several programming paradigms to address different problems and to allow a wider support of architectures. 
With two of its modules NSIMD provides three programming paradigms: - Imperative programming provided by NSIMD core that supports a lots of CPU/SIMD extensions. - Expressions templates provided by the TET1D module that supports all architectures from NSIMD core and adds support for NVIDIA and AMD GPUs. - Single Program Multiple Data provided by the SPMD module that supports all architectures from NSIMD core and adds support for NVIDIA and AMD GPUs. ## Supported architectures | Architecture | NSIMD core | TET1D module | SPMD module | |:--------------------------------------|:----------:|:------------:|:-----------:| | CPU (scalar functions) | Y | Y | Y | | CPU (128-bits SIMD emulation) | Y | Y | Y | | Intel SSE 2 | Y | Y | Y | | Intel SSE 4.2 | Y | Y | Y | | Intel AVX | Y | Y | Y | | Intel AVX2 | Y | Y | Y | | Intel AVX-512 for KNLs | Y | Y | Y | | Intel AVX-512 for Skylake processors | Y | Y | Y | | Arm NEON 128 bits (ARMv7 and earlier) | Y | Y | Y | | Arm NEON 128 bits (ARMv8 and later) | Y | Y | Y | | Arm SVE (original sizeless SVE) | Y | Y | Y | | Arm fixed sized SVE | Y | Y | Y | | IBM POWERPC VMX | Y | Y | Y | | IBM POWERPC VSX | Y | Y | Y | | NVIDIA CUDA | N | Y | Y | | AMD ROCm | N | Y | Y | | Intel oneAPI | N | Y | Y | ## Contributions | Contributor | Contribution(s) | |:---------------------|:--------------------------------------------------| | Guillaume Quintin | Maintainer + main contributor | | Alan Kelly | Arm NEON + mathematical functions | | Kenny Péou | Fixed point module | | Xavier Berault | PowerPC VMX and VSX | | Vianney Stricher | NSIMD core + oneAPI in SPMD and TET1D modules | | Quentin Khan | Soa/AoS loads and stores | | Paul Gannay | PowerPC VMX, VSX + testing system | | Charly Chevalier | Benchmarking system + Python internals | | Erik Schnetter | Fixes + code generation | | Lénaïc Bagnères | Fixes + TET1D module | | Jean-Didier Pailleux | Shuffles operators | ## How it works? 
To achieve maximum performance, NSIMD mainly relies on the inline optimization pass of the compiler. Therefore using any mainstream compiler such as GCC, Clang, MSVC, XL C/C++, ICC and others with NSIMD will give you a zero-cost SIMD abstraction library. To allow inlining, a lot of code is placed in header files. *Small* functions such as addition, multiplication, square root, etc, are all present in header files whereas big functions such as I/O are put in source files that are compiled as a `.so`/`.dll` library. NSIMD provides C89, C11, C++98, C++11, C++14 and C++20 APIs. All APIs allow writing generic code. For the C API this is achieved through a thin layer of macros and with the `_Generic` keyword for the C advanced API; for the C++ APIs it is achieved using templates and function overloading. The C++ APIs are split into two. The first part is a C-like API with only function calls and direct type definitions for SIMD types while the second one provides operator overloading, higher level type definitions that allows unrolling. C++11, C++14 APIs add for instance templated type definitions and templated constants while the C++20 API uses concepts for better error reporting. Binary compatibility is guaranteed by the fact that only a C ABI is exposed. The C++ API only wraps the C calls. ## Supported compilers NSIMD is tested with GCC, Clang, MSVC, NVCC, HIPCC and ARMClang. As a C89 and a C++98 API are provided, other compilers should work fine. Old compiler versions should work as long as they support the targeted SIMD extension. For instance, NSIMD can compile SSE 4.2 code with MSVC 2010. # Build the library ## CMake As CMake is widely used as a build system, we have added support for building the library only and the corresponding find module. ```sh mkdir build cd build cmake .. 
-Dsimd=SIMD_EXT make make install ``` where `SIMD_EXT` is one of the following: CPU, SSE2, SSE42, AVX, AVX2, AVX512\_KNL, AVX512\_SKYLAKE, NEON128, AARCH64, SVE, SVE128, SVE256, SVE512, SVE1024, SVE2048, VMX, VSX, CUDA, ROCM. Note that when compiling for NEON128 on Linux one has to choose the ABI, either armel or armhf. Default is armel. As CMake is unable to autodetect this parameter one has to tell CMake manually. ```sh cmake .. -Dsimd=neon128 # for armel cmake .. -Dsimd=neon128 -DNSIMD_ARM32_IS_ARMEL=OFF # for armhf ``` We provide in the `scripts` directory a CMake find module to find NSIMD on your system. One can let the module find NSIMD on its own, if several versions for different SIMD extensions of NSIMD are installed then the module will find and return one. There is no guaranty on which versions will be chosen by the module. ```cmake find_package(NSIMD) ``` If one wants a specific version of the library for a given SIMD extension then use the `COMPONENTS` part of `find_package`. Only one component is supported at a time. ```cmake find_package(NSIMD COMPONENTS avx2) # find only NSIMD for Intel AVX2 find_package(NSIMD COMPONENTS sve) # find only NSIMD for Arm SVE find_package(NSIMD COMPONENTS sse2 sse42) # unsupported ``` ## Nsconfig The support for CMake has been limited to building the library only. If you wish to run tests or contribute you need to use nsconfig as CMake has several flaws: - too slow especially on Windows, - inability to use several compilers at once, - inability to have a portable build system, - very poor support for portable compilation flags, - ... ## Dependencies (nsconfig only) Generating C/C++ files is done by the Python3 code contained in the `egg`. Python should be installed by default on any Linux distro. On Windows it comes with the latest versions of Visual Studio on Windows (), you can also download and install it directly from . The Python code can call `clang-format` to properly format all generated C/C++ source. 
On Linux you can install it via your package manager. On Windows you can use the official binary at . Compiling the library requires a C++98 compiler. Any version of GCC, Clang or MSVC will do. Note that the produced library and header files for the end-user are C89, C++98, C++11 compatible. Note that C/C++ files are generated by a bunch of Python scripts and they must be executed first before running building the library. ## Build for Linux ```bash bash scripts/build.sh for simd_ext1/.../simd_extN with comp1/.../compN ``` For each combination a directory `build-simd_ext-comp` will be created and will contain the library. Supported SIMD extension are: - sse2 - sse42 - avx - avx2 - avx512\_knl - avx512\_skylake - neon128 - aarch64 - sve - sve128 - sve256 - sve512 - sve1024 - sve2048 - vmx - vsx - cuda - rocm Supported compiler are: - gcc - clang - icc - armclang - xlc - dpcpp - fcc - cl - nvcc - hipcc Note that certain combination of SIMD extension/compilers are not supported such as aarch64 with icc, or avx512\_skylake with nvcc. ## Build on Windows Make sure you are typing in a Visual Studio prompt. The command is almost the same as for Linux with the same constraints on the pairs SIMD extension/compilers. ```batch scripts\build.bat for simd_ext1/.../simd_extN with comp1/.../compN ``` ## More details on building the library The library uses a tool called nsconfig () which is basically a Makefile translator. If you have just built NSIMD following what's described above you should have a `nstools` directory which contains `bin/nsconfig`. If not you can generate it using on Linux ```bash bash scripts/setup.sh ``` and on Windows ```batch scripts\setup.bat ``` Then you can use `nsconfig` directly it has a syntax similar to CMake at command line. Here is a quick tutorial with Linux command line. We first go to the NSIMD directory and generate both NSIMD and nsconfig. 
```bash $ cd nsimd $ python3 egg/hatch.py -ltf $ bash scripts/setup.sh $ mkdir build $ cd build ``` Help can be displayed using `--help`. ```bash $ ../nstools/bin/nsconfig --help usage: nsconfig [OPTIONS]... DIRECTORY Configure project for compilation. -v verbose mode, useful for debugging -nodev Build system will never call nsconfig -DVAR=VALUE Set value of variable VAR to VALUE -list-vars List project specific variable -GBUILD_SYSTEM Produce files for build system BUILD_SYSTEM Supported BUILD_SYSTEM: make POSIX Makefile gnumake GNU Makefile nmake Microsot Visual Studio NMake Makefile ninja Ninja build file (this is the default) list-vars List project specific variables -oOUTPUT Output to OUTPUT instead of default -suite=SUITE Use compilers from SUITE as default ones Supported SUITE: gcc The GNU compiler collection msvc Microsoft C and C++ compiler llvm The LLVM compiler infrastructure armclang Arm suite of compilers based on LLVM xlc IBM suite of compilers fcc_trad_mode Fujitsu compiler in traditional mode fcc_clang_mode Fujitsu compiler in clang mode emscripten Emscripten suite for compiling into JS icc Intel C amd C++ compiler rocm Radeon Open Compute compilers oneapi Intel oneAPI compilers cuda, cuda+gcc, cuda+clang, cuda+msvc Nvidia CUDA C++ compiler -comp=COMMAND,COMPILER[,PATH[,VERSION[,ARCHI]]] Use COMPILER when COMMAND is invoked for compilation If VERSION and/or ARCHI are not given, nsconfig will try to determine those. This is useful for cross compiling and/or setting the CUDA host compiler. COMMAND must be in { cc, c++, gcc, g++, cl, icc, nvcc, hipcc, hcc, clang, clang++, armclang, armclang++, cuda-host-c++, emcc, em++ } ; VERSION is compiler dependant. Note that VERSION can be set to only major number(s) in which case nsconfig fill missing numbers with zeros. 
Supported ARCHI: x86 Intel 32-bits ISA x86_64 Intel/AMD 64-bits ISA armel ARMv5 and ARMv6 32-bits ISA armhf ARMv7 32-bits ISA aarch64 ARM 64-bits ISA ppc64el PowerPC 64-bits little entian wasm32 WebAssembly with 32-bits memory indexing wasm64 WebAssembly with 64-bits memory indexing Supported COMPILER: gcc, g++ GNU Compiler Collection clang, clang++ LLVM Compiler Infrastructure emcc, em++ Emscripten compilers msvc, cl Microsoft Visual C++ armclang, armclang++ ARM Compiler xlc, xlc++ IBM Compiler icc Intel C/C++ Compiler dpcpp Intel DPC++ Compiler nvcc Nvidia CUDA compiler hipcc ROCm HIP compiler fcc_trad_mode, FCC_trad_mode Fujitsu C and C++ traditionnal compiler fcc_clang_mode, FCC_clang_mode Fujitsu C and C++ traditionnal compiler -prefix=PREFIX Set path for installation to PREFIX -h, --help Print the current help NOTE: Nvidia CUDA compiler (nvcc) needs a host compiler. Usually on Linux systems it is GCC while on Windows systems it is MSVC. If nvcc is chosen as the default C++ compiler via the -suite switch, then its host compiler can be invoked in compilation commands with 'cuda-host-c++'. The latter defaults to GCC on Linux systems and MSVC on Windows systems. The user can of course choose a specific version and path of this host compiler via the '-comp=cuda-host-c++,... parameters. If nvcc is not chosen as the default C++ compiler but is used for compilation then its default C++ host compiler is 'c++'. The latter can also be customized via the '-comp=c++,...' command line switch. ``` Each project can defined its own set of variable controlling the generation of the ninja file of Makefile. ```bash $ ../nstools/bin/nsconfig .. 
-list-vars Project variables list: name | description -----------------|----------------------------------- simd | SIMD extension to use cuda_arch_flags | CUDA target arch flag(s) for tests static_libstdcpp | Compile the libstdc++ statically cpp20_tests | Enable C++20 tests ``` Finally one can choose what to do and compile NSIMD and its tests. ```bash $ ../nstools/bin/nsconfig .. -Dsimd=avx2 $ ninja $ ninja tests ``` Nsconfig comes with nstest a small tool to execute tests. ```bash $ ../nstools/bin/nstest -j20 ``` ## Cross compilation It is useful to cross-compile for example when you are on a Intel workstation and want to compile for a Raspberry Pi. Nsconfig generates some code, compiles and runs it to obtain information on the C or C++ compilers. When cross compiling, unless you configured your Linux box with binfmt\_misc to transparently execute aarch64 binaries on a x86\_64 host you need to give nsconfig all the information about the compilers so that it does not need to run aarch64 code on a x86\_64 host. ```bash $ ../nstools/bin/nsconfig .. -Dsimd=aarch64 \ -comp=cc,gcc,aarch64-linux-gnu-gcc,10.0,aarch64 \ -comp=c++,gcc,aarch64-linux-gnu-g++,10.0,aarch64 ``` ## Defines that control NSIMD compilation and usage Several defines control NSIMD. - `FMA` or `NSIMD_FMA` indicate to NSIMD that fma intrinsics can be used when compiling code. This is useful on Intel SSE2, SSE42, AVX and AVX2. - `FP16` or `NSIMD_FP16` indicate to NSIMD that the targeted architecture natively (and possibly partially) supports IEEE float16's. This is useful when compiling for Intel SSE2, SSE42, AVX and AVX2, Arm NEON128 and AARCH64. # Philosophy of NSIMD Originally the library aimed at providing a portable zero-cost abstraction over SIMD vendor intrinsics disregarding the underlying SIMD vector length. NSIMD will of course continue to wrap SIMD intrinsics from various vendors but more efforts will be put into writing NSIMD modules and improving the existing ones especially the SPMD module. 
## The SPMD paradigm It is our belief that SPMD is a good paradigm for writing vectorized code. It helps both the developer and the compiler writer. It forces the developers to better arrange their data in memory, in a way more suited for vectorization. On the compiler side it is simpler to write a "SPMD compiler" than a standard C/C++/Fortran compiler that tries to autovectorize some weird loop with data scattered all around the place. Our priorities for our SPMD module are the following: - Add oneAPI/SYCL support. - Provide a richer API. - Provide cross-lane data transfer. - Provide a way to abstract shared memory. Our approach can be roughly compared to ISPC (<https://ispc.github.io/>) but from a library point of view. ## Wrapping intrinsics in NSIMD core NSIMD was designed following as closely as possible the following guidelines: - Correctness takes precedence over speed except for corner cases which may include the following: + Buggy intrinsics on rare input values (denormal numbers, infinities, NaNs) in which case a slower but correct alternative may be proposed to bypass the buggy intrinsics. + A buggy intrinsic, but only for a specific version of a family of chips. It would be unreasonable to penalize the majority of users vs. a few (or even no) users. - Emulate with tricks and intrinsic integer arithmetic when not available. - Use common names as found in common computation libraries. - Do not hide SIMD registers: one variable (of a type such as `nsimd::pack`) matches one register. When possible force the user to think differently between SIMD code and scalar code. - Make the life of the compiler as easy as possible: keep the code simple to allow the compiler to perform as many optimizations as possible. - Favor the advanced C++ API. You may wrap intrinsics that require compile time knowledge of the underlying vector length but this should be done with caution. Wrapping intrinsics that do not exist for all types is difficult and may require casting or emulation.
For instance, 8 bit integer vector multiplication using SSE2 does not exist. We can either process each pair of integers individually or we can cast the 8 bit vectors to 16 bit vectors, do the multiplication and cast them back to 8 bit vectors. In the second case, chaining operations will generate many unwanted casts. To avoid hiding important details from the user, overloads of operators involving scalars and SIMD vectors are not provided by default. Those can be included explicitly to emphasize the fact that using expressions like `scalar + vector` might incur an optimization penalty. The use of `nsimd::pack` may not be portable to ARM SVE and therefore must be included manually. ARM SVE registers can only be stored in sizeless structs (`__sizeless_struct`). This feature (as of 2019/04/05) is only supported by the ARM compiler. We do not know whether other compilers will use the same keyword or paradigm to support SVE intrinsics. # Contributing to NSIMD The wrapping of intrinsics, the writing of test and bench files are tedious and repetitive tasks. Most of those are generated using Python scripts that can be found in `egg`. - Intrinsics that do not require knowing the vector length can be wrapped and will be accepted with no problem. - Intrinsics that do require the vector length at compile time can be wrapped but it is up to the maintainer to accept it. - Use `clang-format` when writing C or C++ code. - The `.cpp` files are written in C++98. - The header files must be compatible with C89 (when possible otherwise C99), C++98, C++11, C++14 up to and including C++20. Please see [CONTRIBUTING.md](CONTRIBUTING.md) for more details. # LICENSES NSIMD contains files from the excellent [Sleef library](https://sleef.org/) whose license is stated below. The corresponding files are all located in the `src` folder and have retained their original license notices.
## NSIMD license Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
## Sleef license ([Boost Software License v1.0](https://www.boost.org/LICENSE_1_0.txt)) Boost Software License - Version 1.0 - August 17th, 2003 Permission is hereby granted, free of charge, to any person or organization obtaining a copy of the software and accompanying documentation covered by this license (the "Software") to use, reproduce, display, distribute, execute, and transmit the Software, and to prepare derivative works of the Software, and to permit third-parties to whom the Software is furnished to do so, all subject to the following: The copyright notices in the Software and this entire statement, including the above license grant, this restriction and the following disclaimer, must be included in all copies of the Software, in whole or in part, and all derivative works of the Software, unless such copies or derivative works are solely in the form of machine-executable object code generated by a source language processor. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: benches/benches.hpp ================================================ #ifndef BENCHES_HPP #define BENCHES_HPP #include #include #include namespace nsimd { namespace benches { template double rand_sign() { if (std::is_unsigned::value) { return 1.; } else { return (::rand() % 2) ? 1. 
: -1.; } } template T rand_bits(T min, T max = std::numeric_limits::max()) { T r; do { int nbits = sizeof(T) * CHAR_BIT; u64 x = 0; for (int i = 0; i < nbits; ++i) { x |= u64(::rand() % 2) << i; } r = *((T*)&x); } while (r < min || r > max); return r; } template T rand_from(T min, T max = std::numeric_limits::max()) { // From: http://c-faq.com/lib/randrange.html return T(double(min) + (double(::rand()) / (double(RAND_MAX) / (double(max) - double(min) + 1)))); } template T rand_fp(T min, T max) { T r; if (std::isinf(min) && std::isinf(max)) { // For now, we're not using this method for random number //r = rand_bits(min, max); r = rand_from(-1000000, 1000000); } else { r = rand_from(min, max); } return r; } template T rand(T min, T max = std::numeric_limits::max()) { return rand_from(min, max); } template <> float rand(float min, float max) { return rand_fp(min, max); } template <> double rand(double min, double max) { return rand_fp(min, max); } } } #endif ================================================ FILE: build.nsconfig ================================================ # MIT License # # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. package_name nsimd-3.0 ## ---------------------------------------------------------------------------- ## Get OS/Compiler specific file extensions set o = @obj_ext set exe = @exe_ext set s = @asm_ext set so = @shared_lib_ext set lib = @shared_link_ext set root = @source_dir set make = @make_command set build = @build_dir set root = @source_dir set ccomp = @ccomp_name set cppcomp = @cppcomp_name ## ---------------------------------------------------------------------------- ## Some defaults ifnot_set "SIMD extension to use" simd = cpu ifnot_set "CUDA target arch flag(s) for tests" cuda_arch_flags = "" ifnot_set "Compile the libstdc++ statically" static_libstdcpp = true ifnot_set "Enable C++20 tests" cpp20_tests = "" ## ---------------------------------------------------------------------------- ## Targets for compilation set o_for_ = fp16$o memory$o ufp$o api_cpu$o rempitab$o \ sleefsp$o sleefdp$o gpu$o set o_for_cpu = $o_for_ set o_for_cuda = $o_for_ set o_for_rocm = $o_for_ set o_for_oneapi = $o_for_ set o_for_sse2 = $o_for_cpu api_sse2$o sleef_sse2_f32$o \ sleef_sse2_f64$o set o_for_sse42 = $o_for_sse2 api_sse42$o sleef_sse42_f32$o \ sleef_sse42_f64$o set o_for_avx = $o_for_sse42 api_avx$o sleef_avx_f32$o \ sleef_avx_f64$o set o_for_avx2 = $o_for_avx api_avx2$o sleef_avx2_f32$o \ sleef_avx2_f64$o set o_for_avx512_knl = $o_for_avx2 api_avx512_knl$o \ sleef_avx512_knl_f32$o sleef_avx512_knl_f64$o set o_for_avx512_skylake = $o_for_avx2 api_avx512_skylake$o \ sleef_avx512_skylake_f32$o \ sleef_avx512_skylake_f64$o set o_for_neon128 = $o_for_cpu api_neon128$o sleef_neon128_f32$o \ sleef_neon128_f64$o set o_for_aarch64 = $o_for_cpu api_aarch64$o sleef_aarch64_f32$o \ sleef_aarch64_f64$o set o_for_sve = 
$o_for_aarch64 api_sve$o sleef_sve_f32$o \ sleef_sve_f64$o set o_for_sve128 = $o_for_aarch64 api_sve128$o sleef_sve128_f32$o \ sleef_sve128_f64$o set o_for_sve256 = $o_for_aarch64 api_sve256$o sleef_sve256_f32$o \ sleef_sve256_f64$o set o_for_sve512 = $o_for_aarch64 api_sve512$o sleef_sve512_f32$o \ sleef_sve512_f64$o set o_for_sve1024 = $o_for_aarch64 api_sve1024$o sleef_sve1024_f32$o \ sleef_sve1024_f64$o set o_for_sve2048 = $o_for_aarch64 api_sve2048$o sleef_sve2048_f32$o \ sleef_sve2048_f64$o set o_for_vmx = $o_for_cpu api_vmx$o sleef_vmx_f32$o sleef_vmx_f64$o set o_for_vsx = $o_for_vmx api_vsx$o sleef_vsx_f32$o sleef_vsx_f64$o ## ---------------------------------------------------------------------------- ## SIMD compiler flags lambda cflags_for_generic_* = -DCPU set cflags_for_generic_cuda = -DCUDA set cflags_for_generic_rocm = -DROCM set cflags_for_generic_oneapi = -DONEAPI set cflags_for_ = ${cflags_for_generic_$simd$} set cflags_for_cpu = $cflags_for_ set cflags_for_cuda = -DCUDA set cflags_for_rocm = -DROCM set cflags_for_oneapi = -DONEAPI set cflags_for_sse2 = -DSSE2 -msse2 set cflags_for_sse42 = -DSSE42 -msse42 set cflags_for_avx = -DAVX -mavx set cflags_for_avx2 = -DAVX2 -mavx2 -DFMA -mfma -DFP16 -mfp16 set cflags_for_avx512_knl = -DAVX512_KNL -mavx512_knl -mfma -DFP16 -mfp16 set cflags_for_avx512_skylake = -DAVX512_SKYLAKE -mavx512_skylake -mfma \ -DFP16 -mfp16 set cflags_for_neon128 = -DNEON128 -mneon128 set cflags_for_aarch64 = -DAARCH64 -maarch64 set cflags_for_sve = -DSVE -msve set cflags_for_sve128 = -DSVE128 -msve128 set cflags_for_sve256 = -DSVE256 -msve256 set cflags_for_sve512 = -DSVE512 -msve512 set cflags_for_sve1024 = -DSVE1024 -msve1024 set cflags_for_sve2048 = -DSVE2048 -msve2048 set cflags_for_vmx = -DVMX -mvmx set cflags_for_vsx = -DVSX -mvsx ## ---------------------------------------------------------------------------- ## std default flag lambda std_flag_for_* = -std=c++98 set std_flag_for_rocm = -std=c++11 set std_flag_for_oneapi = 
-std=c++17 ## ---------------------------------------------------------------------------- ## libstdc++ linking mode set libstdcpp_static_link_true = -static-libstdc++ set libstdcpp_static_link_false = ## ---------------------------------------------------------------------------- ## Some defaults set flags = -Wall -fPIC -O2 -I$root$/include -DNDEBUG set cflags = ${std_flag_for_$simd$} $flags \ ${libstdcpp_static_link_$static_libstdcpp$} set sleef_cflags = -fPIC -O2 -I$root$/src -DNDEBUG -DDORENAME=1 ## ---------------------------------------------------------------------------- ## Default building rules phony all deps libnsimd_$simd$$so$ build_file libnsimd_$simd$$so deps ${o_for_$simd$} c++ -fPIC -shared @in -o @out set ldflags = -fPIC -L. -lnsimd_$simd ## ---------------------------------------------------------------------------- ## Generic (emulation) rules for building build_file gpu$o autodeps $root$/src/gpu.cpp c++ $cflags$ $cflags_for_cpu @in -c -o @out build_file ufp$o autodeps $root$/src/ufp.cpp c++ $cflags$ $cflags_for_cpu @in -c -o @out build_file fp16$o autodeps $root$/src/fp16.cpp c++ $cflags$ $cflags_for_cpu @in -c -o @out build_file memory$o autodeps $root$/src/memory.cpp c++ $cflags$ $cflags_for_cpu @in -c -o @out build_file rempitab$o autodeps $root$/src/rempitab.c cc $sleef_cflags$ -c @in -o @out build_file sleefsp$o autodeps $root$/src/sleefsp.c cc $sleef_cflags$ -c @in -o @out build_file sleefdp$o autodeps $root$/src/sleefdp.c cc $sleef_cflags$ -c @in -o @out build_file api_cpu$o autodeps $root$/src/api_cpu.cpp c++ $cflags$ $cflags_for_cpu -c @in -o @out ## ---------------------------------------------------------------------------- ## Intel rules for building build_file api_sse2$o autodeps $root$/src/api_sse2.cpp c++ $cflags$ -c $cflags_for_sse2 @in -o @out build_file sleef_sse2_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -msse2 -DNSIMD_SSE2 -DENABLE_SSE2=1 @in -o @out build_file sleef_sse2_f64$o autodeps 
$root$/src/sleefsimddp.c cc $sleef_cflags$ -c -msse2 -DNSIMD_SSE2 -DENABLE_SSE2=1 @in -o @out build_file api_sse42$o autodeps $root$/src/api_sse42.cpp c++ $cflags$ -c $cflags_for_sse42 @in -o @out build_file sleef_sse42_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -msse42 -DNSIMD_SSE42 -DENABLE_SSE4=1 @in -o @out build_file sleef_sse42_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -msse42 -DNSIMD_SSE42 -DENABLE_SSE4=1 @in -o @out build_file api_avx$o autodeps $root$/src/api_avx.cpp c++ $cflags$ -c $cflags_for_avx @in -o @out build_file sleef_avx_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -mavx -DNSIMD_AVX -DENABLE_AVX=1 @in -o @out build_file sleef_avx_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -mavx -DNSIMD_AVX -DENABLE_AVX=1 @in -o @out build_file api_avx2$o autodeps $root$/src/api_avx2.cpp c++ $cflags$ -c $cflags_for_avx2 @in -o @out build_file sleef_avx2_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -mavx2 -mfma -DNSIMD_AVX2 -DENABLE_AVX2=1 \ @in -o @out build_file sleef_avx2_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -mavx2 -mfma -DNSIMD_AVX2 -DENABLE_AVX2=1 \ @in -o @out build_file api_avx512_knl$o autodeps $root$/src/api_avx512_knl.cpp c++ $cflags$ -c $cflags_for_avx512_knl @in -o @out build_file sleef_avx512_knl_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -mavx512_knl -DNSIMD_AVX512_KNL \ -DENABLE_AVX512F=1 @in -o @out build_file sleef_avx512_knl_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -mavx512_knl -DNSIMD_AVX512_KNL \ -DENABLE_AVX512F=1 @in -o @out build_file api_avx512_skylake$o autodeps $root$/src/api_avx512_skylake.cpp c++ $cflags$ -c $cflags_for_avx512_skylake @in -o @out build_file sleef_avx512_skylake_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -mavx512_knl -DNSIMD_AVX512_SKYLAKE \ -DENABLE_AVX512F=1 @in -o @out build_file sleef_avx512_skylake_f64$o autodeps $root$/src/sleefsimddp.c cc 
$sleef_cflags$ -c -mavx512_knl -DNSIMD_AVX512_SKYLAKE \ -DENABLE_AVX512F=1 @in -o @out ## ---------------------------------------------------------------------------- ## ARM 32 bits rules for building build_file api_neon128$o autodeps $root$/src/api_neon128.cpp c++ $cflags$ -c $cflags_for_neon128 @in -o @out build_file sleef_neon128_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -mneon128 -DNSIMD_NEON128 \ -DENABLE_NEON32=1 @in -o @out build_file sleef_neon128_f64$o autodeps $root$/src/sleefsimddp_emulation.c cc $sleef_cflags$ -c -mneon128 -DNSIMD_NEON128 -DENABLE_NEON32=1 \ -I$root$/include @in -o @out ## ---------------------------------------------------------------------------- ## ARM 64 bits rules for building build_file api_aarch64$o autodeps $root$/src/api_aarch64.cpp c++ $cflags$ -c $cflags_for_aarch64 @in -o @out build_file sleef_aarch64_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -maarch64 -DNSIMD_AARCH64 \ -DENABLE_ADVSIMD=1 @in -o @out build_file sleef_aarch64_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -maarch64 -DNSIMD_AARCH64 \ -DENABLE_ADVSIMD=1 @in -o @out build_file api_sve$o autodeps $root$/src/api_sve.cpp c++ $cflags$ -c $cflags_for_sve @in -o @out build_file sleef_sve_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -msve -DNSIMD_SVE -DENABLE_SVE=1 @in -o @out build_file sleef_sve_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -msve -DNSIMD_SVE -DENABLE_SVE=1 @in -o @out build_file api_sve128$o autodeps $root$/src/api_sve128.cpp c++ $cflags$ -c $cflags_for_sve128 @in -o @out build_file sleef_sve128_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -msve128 -DNSIMD_SVE128 -DENABLE_SVE=1 @in -o @out build_file sleef_sve128_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -msve128 -DNSIMD_SVE128 -DENABLE_SVE=1 @in -o @out build_file api_sve256$o autodeps $root$/src/api_sve256.cpp c++ $cflags$ -c $cflags_for_sve256 @in -o @out build_file 
sleef_sve256_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -msve256 -DNSIMD_SVE256 -DENABLE_SVE=1 @in -o @out build_file sleef_sve256_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -msve256 -DNSIMD_SVE256 -DENABLE_SVE=1 @in -o @out build_file api_sve512$o autodeps $root$/src/api_sve512.cpp c++ $cflags$ -c $cflags_for_sve512 @in -o @out build_file sleef_sve512_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -msve512 -DNSIMD_SVE512 -DENABLE_SVE=1 @in -o @out build_file sleef_sve512_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -msve512 -DNSIMD_SVE512 -DENABLE_SVE=1 @in -o @out build_file api_sve1024$o autodeps $root$/src/api_sve1024.cpp c++ $cflags$ -c $cflags_for_sve1024 @in -o @out build_file sleef_sve1024_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -msve1024 -DNSIMD_SVE1024 -DENABLE_SVE=1 \ @in -o @out build_file sleef_sve1024_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -msve1024 -DNSIMD_SVE1024 -DENABLE_SVE=1 \ @in -o @out build_file api_sve2048$o autodeps $root$/src/api_sve2048.cpp c++ $cflags$ -c $cflags_for_sve2048 @in -o @out build_file sleef_sve2048_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -msve2048 -DNSIMD_SVE2048 -DENABLE_SVE=1 \ @in -o @out build_file sleef_sve2048_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -msve2048 -DNSIMD_SVE2048 -DENABLE_SVE=1 \ @in -o @out ## ---------------------------------------------------------------------------- ## POWERPC rules for building build_file api_vmx$o autodeps $root$/src/api_vmx.cpp c++ $cflags$ -c $cflags_for_vmx @in -o @out build_file sleef_vmx_f32$o autodeps $root$/src/sleefsimdsp_emulation.c cc $sleef_cflags$ -c -mvmx -DNSIMD_VMX -DENABLE_VSX=1 \ -I$root$/include @in -o @out build_file sleef_vmx_f64$o autodeps $root$/src/sleefsimddp_emulation.c cc $sleef_cflags$ -c -mvmx -DNSIMD_VMX -DENABLE_VSX=1 \ -I$root$/include @in -o @out build_file api_vsx$o autodeps $root$/src/api_vsx.cpp c++ 
$cflags$ -c $cflags_for_vsx @in -o @out build_file sleef_vsx_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -mvsx -DNSIMD_VSX -DENABLE_VSX=1 @in -o @out build_file sleef_vsx_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -mvsx -DNSIMD_VSX -DENABLE_VSX=1 @in -o @out ## ---------------------------------------------------------------------------- ## Installation and packaging install_file libnsimd_${simd}$so lib [W] install_file libnsimd_${simd}$lib lib install_dir $root$/include/nsimd include install_dir $root$/doc/html doc ## ---------------------------------------------------------------------------- ## Tests # Lambda arguments: suite, compiler, std, simd_ext # By default all tests will be considered lambda tests_*_*_* = ok # Now disable some possibilities on certain compilers set tests_clang_c89_vmx = "" set tests_clang_c89_vsx = "" set tests_clang_c89_sve = "" lambda tests_*_c89_cuda = "" lambda tests_*_c99_cuda = "" lambda tests_*_c11_cuda = "" lambda tests_*_cpp17_cuda = "" lambda tests_*_c89_rocm = "" lambda tests_*_c99_rocm = "" lambda tests_*_c11_rocm = "" lambda tests_*_cpp98_rocm = "" lambda tests_*_cpp17_rocm = "" lambda tests_*_c89_oneapi = "" lambda tests_*_c99_oneapi = "" lambda tests_*_c11_oneapi = "" lambda tests_dpcpp_cpp98_* = "" lambda tests_dpcpp_cpp11_* = "" set c89_enabled = ${tests_$ccomp$_c89_$simd$} set c89.files = "" set c99_enabled = ${tests_$ccomp$_c99_$simd$} set c99.files = "" set c11_enabled = ${tests_$ccomp$_c11_$simd$} set c11.files = "" set cpp98_enabled = ${tests_$cppcomp$_cpp98_$simd$} set cpp98.files = "" set cpp11_enabled = ${tests_$cppcomp$_cpp11_$simd$} set cpp11.files = "" set cpp17_enabled = ${tests_$cppcomp$_cpp17_$simd$} set cpp17.files = "" set cpp20.files = "" set tests_flags = $cuda_arch_flags $flags ${cflags_for_$simd$} -lm $ldflags echo Test compilation flags: $tests_flags$ [$c89_enabled$] build_files c89 foreach glob:$root$/tests/*.prec11.c \ as tests.%r.c89$exe \ autodeps @item 
libnsimd_$simd$$so$ [$c89_enabled$] cc -std=c89 @item $tests_flags -o @out [$c89_enabled$] phony tests.c89 deps $c89.files [$c99_enabled$] build_files c99 foreach glob:$root$/tests/*.prec11.c \ as tests.%r.c99$exe \ autodeps @item libnsimd_$simd$$so$ [$c99_enabled$] cc -std=c99 @item $tests_flags -o @out [$c99_enabled$] phony tests.c99 deps $c99.files [$c11_enabled$] build_files c11 foreach glob:$root$/tests/*.c \ as tests.%r.c11$exe \ autodeps @item libnsimd_$simd$$so$ [$c11_enabled$] cc -std=c11 @item $tests_flags -o @out [$c11_enabled$] phony tests.c11 deps $c11.files [$cpp98_enabled$] build_files cpp98 foreach glob:$root$/tests/*.cpp \ as tests.%r.cpp98$exe \ autodeps @item libnsimd_$simd$$so$ [$cpp98_enabled$] c++ -std=c++98 @item $tests_flags -o @out [$cpp98_enabled$] phony tests.cpp98 deps $cpp98.files [$cpp11_enabled$] build_files cpp11 foreach glob:$root$/tests/*.cpp \ as tests.%r.cpp11$exe \ autodeps @item libnsimd_$simd$$so$ [$cpp11_enabled$] c++ -std=c++11 @item $tests_flags -o @out [$cpp11_enabled$] phony tests.cpp11 deps $cpp11.files [$cpp17_enabled$] build_files cpp17 foreach glob:$root$/tests/*.cpp \ as tests.%r.cpp17$exe \ autodeps @item libnsimd_$simd$$so$ [$cpp17_enabled$] c++ -std=c++17 @item $tests_flags -o @out [$cpp17_enabled$] phony tests.cpp17 deps $cpp17.files [$cpp20_tests$] build_files cpp20 foreach glob:$root$/tests/*.cpp \ as tests.%r.cpp20$exe \ autodeps @item libnsimd_$simd$$so$ [$cpp20_tests$] c++ -std=c++20 @item $tests_flags -o @out [$cpp20_tests$] phony tests.cpp20 deps $cpp20.files # Phony target for tests phony tests deps $c89.files $c99.files $c11.files $cpp98.files $cpp11.files \ $cpp17.files $cpp20.files ## ---------------------------------------------------------------------------- ## Examples build_files examples_cpp98 foreach glob:$root$/examples/*.cpp \ as examples.%r.cpp98$exe \ autodeps @item libnsimd_$simd$$so$ c++ -std=c++98 @item $tests_flags -o @out phony examples.cpp98 deps $examples_cpp98.files 
================================================ FILE: doc/Makefile.nix ================================================ # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. NS2_ROOT = ../nstools/ns2 CXX = c++ CXX_FLAGS = -O2 -Wall -Wextra -pedantic -std=c++11 all: md2html what_is_wrapped libns2.a: $(NS2_ROOT)/../.git/logs/HEAD Makefile.nix rm -rf libns2 mkdir -p libns2 cp $(NS2_ROOT)/lib/*.cpp libns2 (cd libns2 && $(CXX) $(CXX_FLAGS) -I../$(NS2_ROOT)/include -c *.cpp) ar rcs $@ libns2/*.o rm -rf libns2 md2html: libns2.a md2html.cpp Makefile.nix $(CXX) $(CXX_FLAGS) md2html.cpp -I$(NS2_ROOT)/include -o $@ -L. -lns2 what_is_wrapped: libns2.a what_is_wrapped.cpp Makefile.nix $(CXX) $(CXX_FLAGS) what_is_wrapped.cpp -I$(NS2_ROOT)/include -o $@ \ -L. 
-lns2 ================================================ FILE: doc/Makefile.win ================================================ # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
NS2_ROOT = ..\nstools\ns2 CXX = cl CXX_FLAGS = /nologo /Ox /W3 /EHsc /DNS_NO_DLLSPEC /D_CRT_SECURE_NO_WARNINGS all: md2html.exe what_is_wrapped.exe libns2.lib: $(NS2_ROOT)\..\.git\logs\HEAD Makefile.win if exist libns2 rd /Q /S libns2 md libns2 copy /Y $(NS2_ROOT)\lib\*.cpp libns2 (cd libns2 && $(CXX) $(CXX_FLAGS) -I..\$(NS2_ROOT)\include /c *.cpp) lib /nologo /out:libns2.lib libns2\*.obj rd /Q /S libns2 md2html.exe: libns2.lib md2html.cpp Makefile.win $(CXX) $(CXX_FLAGS) /I$(NS2_ROOT)\include md2html.cpp libns2.lib \ Shlwapi.lib Dbghelp.lib /Fe$@ what_is_wrapped.exe: libns2.lib what_is_wrapped.cpp Makefile.win $(CXX) $(CXX_FLAGS) /I$(NS2_ROOT)\include what_is_wrapped.cpp \ libns2.lib Shlwapi.lib Dbghelp.lib /Fe$@ ================================================ FILE: doc/markdown/compilers_and_versions.md ================================================ `nsimd` is tested with GCC, Clang and MSVC. As a C89 and a C++98 API are provided, other compilers should work fine. Old compiler versions should work as long as they support the targeted SIMD extension. For instance, `nsimd` can compile on MSVC 2010 `SSE4.2` code. 
`nsimd` requires a C or a C++ compiler and is actually daily tested on the following compilers for the following hardware: **Compiler** | **Version** | **Architecture** | **Extensions** ----------------------- | ----------- | ---------------- | -------------- GCC | 8.3.0 | Intel | `SSE2`, `SSE4.2`, `AVX`, `AVX2`, `AVX-512` (`KNL` and `SKYLAKE`) Clang | 7.0.1 | Intel | `SSE2`, `SSE4.2`, `AVX`, `AVX2`, `AVX-512` (`KNL` and `SKYLAKE`) GCC | 8.3.0 | ARM | `Aarch64`, `NEON` (`ARMv7`), `SVE` Clang | 7.0.1 | ARM | `Aarch64`, `NEON` (`ARMv7`), `SVE` Microsoft Visual Studio | 2017 | Intel | `SSE4.2` Intel C++ Compiler | 19.0.4.243 | Intel | `SSE2`, `SSE4.2`, `AVX`, `AVX2`, `AVX-512` (`SKYLAKE`) ================================================ FILE: doc/markdown/concepts.md ================================================ # C++20 concepts As of C++20, concepts are available. We quote to introduce concepts. *Class templates, function templates, and non-template functions (typically members of class templates) may be associated with a constraint, which specifies the requirements on template arguments, which can be used to select the most appropriate function overloads and template specializations.* *Named sets of such requirements are called concepts. Each concept is a predicate, evaluated at compile time, and becomes a part of the interface of a template where it is used as a constraint* ## Concepts provided by NSIMD All concepts provided by NSIMD comes in two forms: - The native C++20 form in the `nsimd` namespace - As a macro for keeping the compatibility with older versions of C++ The following tables list all concepts and is exhaustive. Native concepts are accessible through the `nsimd` namespace. They take only one argument. Their macro counterparts take no argument as they are meant to be used as constraint placeholder types. When compiling for older C++ versions NSIMD concepts macros are simply read as `typename` by the compiler. 
Table for base C and C++ APIs: | Native concept | Macro | Description | |:----------------------------|:-----------------------------------|:-----------------------------------------------| | `simd_ext_c` | `NSIMD_CONCEPT_SIMD_EXT` | Valid SIMD extension | | `simd_value_type_c` | `NSIMD_CONCEPT_VALUE_TYPE` | Valid NSIMD underlying value type | | `simd_value_type_or_bool_c` | `NSIMD_CONCEPT_VALUE_TYPE_OR_BOOL` | Valid NSIMD underlying value type or `bool` | | `alignment_c` | `NSIMD_CONCEPT_ALIGNMENT` | Valid NSIMD alignment `aligned` or `unaligned` | Table for advanced C++ API: | Native concept | Macro | Description | |:---------------|:-------------------------|:----------------------| | `is_pack_c` | `NSIMD_CONCEPT_PACK` | Valid NSIMD pack | | `is_packl_c` | `NSIMD_CONCEPT_PACKL` | Valid NSIMD packl | | `is_packx1_c` | `NSIMD_CONCEPT_PACKX1` | Valid NSIMD packx1 | | `is_packx2_c` | `NSIMD_CONCEPT_PACKX2` | Valid NSIMD packx2 | | `is_packx3_c` | `NSIMD_CONCEPT_PACKX3` | Valid NSIMD packx3 | | `is_packx4_c` | `NSIMD_CONCEPT_PACKX4` | Valid NSIMD packx4 | | `any_pack_c` | `NSIMD_CONCEPT_ANY_PACK` | Any of the above pack | ## Expressing C++20 constraints Expressing constraints can of course be done with the `requires` keyword. But for compatibility with older C++ versions NSIMD provides `NSIMD_REQUIRES` which take as onyl argument the constraints. ```c++ template NSIMD_REQUIRES(sizeof(T) == sizeof(S)) void foo(T, S); ``` It is advised to use doubled parenthesis as coma in the constraints expression can be interpreted as argument separators for the macro itself. 
```c++ template <typename T, typename S> NSIMD_REQUIRES((std::is_same<T, S>::value)) void foo(T, S); ``` Note that when expressing constraints using `nsimd::sizeof_v`'s prefer the NSIMD definition of sizeof for the following reason: when dealing with float16's one cannot know the underlying representation of such a type as it is non-portable and non-standard, but NSIMD provides helper functions to transparently deal with float16's as if they were 16-bits wide. Therefore expressing sizeof equality should be done with `nsimd::sizeof_v`. ```c++ template <typename T, typename S> NSIMD_REQUIRES((nsimd::sizeof_v<T> == nsimd::sizeof_v<S>)) void foo(T, S); ``` ================================================ FILE: doc/markdown/defines.md ================================================ # Defines provided by NSIMD NSIMD uses macros (not function macros) that we call defines to make choices in its code at compile time. Most of them can be of use to the end-user so we list them here. ## Compiler detection The compiler detection is automatically done by NSIMD as it is relatively easy. | Define | Compiler | |---------------------|---------------------------------------------------| | `NSIMD_IS_MSVC` | Microsoft Visual C++ | | `NSIMD_IS_HIPCC` | ROCm HIP compiler (warning, see below) | | `NSIMD_IS_NVCC` | NVIDIA CUDA Compiler | | `NSIMD_IS_ICC` | Intel C++ Compiler | | `NSIMD_IS_CLANG` | Clang/LLVM | | `NSIMD_IS_GCC` | GNU Compiler Collection | | `NSIMD_IS_FCC` | Fujitsu compiler | **Warning**: some HIP versions do not declare themselves at all so it is impossible to find out that HIP is the compiler. As HIP is based on clang, without help NSIMD will detect Clang. It is up to the end-user to compile with `-D__HIPCC__` for NSIMD to detect HIP. Note that we do support the Armclang C and C++ compilers but for NSIMD there is no need to have code different from Clang's specific code so we do not provide a macro to detect this compiler in particular. Note also that two of the above macros can be defined at the same time. 
This happens typically when compiling for a device. For example when compiling for NVIDIA CUDA with nvcc both `NSIMD_IS_NVCC` and `NSIMD_IS_GCC` (when the host compiler is GCC). ## Compilation environment and contants | Define | Description | Possible values | |-------------------|-----------------------|---------------------------------| | `NSIMD_C` | C version | 1989, 1999, 2011 | | `NSIMD_CXX` | C++ version | 1998, 2011, 2014, 2017, 2020 | | `NSIMD_WORD_SIZE` | Machine word size | 32, 64 | | `NSIMD_U8_MIN` | Minimum value for u8 | 0 | | `NSIMD_U8_MAX` | Maximum value for u8 | 255 | | `NSIMD_I8_MIN` | Minimum value for i8 | -128 | | `NSIMD_I8_MAX` | Maximum value for i8 | 127 | | `NSIMD_U16_MIN` | Minimum value for u16 | 0 | | `NSIMD_U16_MAX` | Maximum value for u16 | 65535 | | `NSIMD_I16_MIN` | Minimum value for i16 | -32768 | | `NSIMD_I16_MAX` | Maximum value for i16 | 32767 | | `NSIMD_U32_MIN` | Minimum value for u32 | 0 | | `NSIMD_U32_MAX` | Maximum value for u32 | 4294967295 | | `NSIMD_I32_MIN` | Minimum value for i32 | -2147483648 | | `NSIMD_I32_MAX` | Maximum value for i32 | 2147483647 | | `NSIMD_U64_MIN` | Minimum value for u64 | 0 | | `NSIMD_U64_MAX` | Maximum value for u64 | 18446744073709551615 | | `NSIMD_I64_MIN` | Minimum value for i64 | -9223372036854775808 | | `NSIMD_I64_MAX` | Maximum value for i64 | 9223372036854775807 | | `NSIMD_DLLSPEC` | (Windows) DLL storage-class information | `__declspec(dllexport)` or `__declspec(dllimport)` | | `NSIMD_DLLSPEC` | (Unix) storage-class information | `extern` or nothing | | `NSIMD_C_LINKAGE_FOR_F16` | Indicate whether functions involving f16 have C linkage | defined or not | ## Targeted architecture detection Contrary to the compiler detection, the targeted architecture is not done autoamtically by NSIMD as is really hard and some compilers do not provide the necessary informations. 
So in order to have a consistent way of targeting an architecture this is up to the end-user to specify it using one of the following defines. | Define | Targeted architecture | |------------------------|---------------------------------------------------| | `NSIMD_CPU` | Generic, no SIMD, emulation | | `NSIMD_SSE2` | Intel SSE2 | | `NSIMD_SSE42` | Intel SSE4.2 | | `NSIMD_AVX` | Intel AVX | | `NSIMD_AVX2` | Intel AVX2 | | `NSIMD_AVX512_KNL` | Intel AVX-512 as found on KNLs | | `NSIMD_AVX512_SKYLAKE` | Intel AVX-512 as found on Xeon Skylake | | `NSIMD_NEON128` | Arm NEON 128 bits as found on 32-bits Arm chips | | `NSIMD_AARCH64` | Arm NEON 128 bits as found on 64-bits Arm chips | | `NSIMD_SVE` | Arm SVE (length agnostic) | | `NSIMD_SVE128` | Arm SVE (size known at compilation to 128 bits) | | `NSIMD_SVE256` | Arm SVE (size known at compilation to 256 bits) | | `NSIMD_SVE512` | Arm SVE (size known at compilation to 512 bits) | | `NSIMD_SVE1024` | Arm SVE (size known at compilation to 1024 bits) | | `NSIMD_SVE2048` | Arm SVE (size known at compilation to 2048 bits) | | `NSIMD_CUDA` | Nvidia CUDA | | `NSIMD_ROCM` | AMD ROCm architectures | | `NSIMD_VMX` | IBM POWERPC VMX (Altivec) | | `NSIMD_VSX` | IBM POWERPC VSX (Altivec) | | `NSIMD_FP16` | Architecture supports natively IEEE float16 | | `NSIMD_FMA` | Architecture supports natively FMAs | ## Targeted architecture constants | Define | Description | |-----------------------|----------------------------------------------------| | `NSIMD_NB_REGISTERS` | Number of SIMD registers | | `NSIMD_MAX_LEN_BIT` | Maximum number of bits in a SIMD register | | `NSIMD_MAX_LEN_i8` | Maximum number of i8's in a SIMD register | | `NSIMD_MAX_LEN_u8` | Maximum number of u8's in a SIMD register | | `NSIMD_MAX_LEN_i16` | Maximum number of i16's in a SIMD register | | `NSIMD_MAX_LEN_u16` | Maximum number of u16's in a SIMD register | | `NSIMD_MAX_LEN_i32` | Maximum number of i32's in a SIMD register | | `NSIMD_MAX_LEN_u32` | Maximum number 
of u32's in a SIMD register | | `NSIMD_MAX_LEN_i64` | Maximum number of i64's in a SIMD register | | `NSIMD_MAX_LEN_u64` | Maximum number of u64's in a SIMD register | NSIMD provides a mean to write generic code by using the `NSIMD_MAX_LEN` macros whose argument is one of { i8, u8, i16, u16, i32, u32, i64, u64 }. ```c++ #define T ??? // to be defined as a base type int main(void) { T buf[NSIMD_MAX_LEN(T)]; // an array of T's for loading/storing ... return 0; } ``` ## Other useful macros NSIMD provides macros to concatenate blobs so that generic programming in pure C is possible. - `#define NSIMD_PP_CAT_2(a, b)` concatenates `a` and `b`. - `#define NSIMD_PP_CAT_3(a, b, c)` concatenates `a`, `b` and `c`. - `#define NSIMD_PP_CAT_4(a, b, c, d)` concatenates `a`, `b`, `c` and `d`. - `#define NSIMD_PP_CAT_5(a, b, c, d, e)` concatenates `a`, `b`, `c`, `d` and `e`. - `#define NSIMD_PP_CAT_6(a, b, c, d, e, f)` concatenates `a`, `b`, `c`, `d`, `e` and `f`. ================================================ FILE: doc/markdown/faq.md ================================================ # Frequently Asked Questions ## Is it good practice to use a `nsimd::pack` as a `std::vector`? No, these are two very different objects. A `nsimd::pack` represent a SIMD register whereas a `std::vector` represents a chunk of memory. You should separate concerns and use `std::vector` to store data in your structs or classes, `nsimd::pack` should only be used in computation kernels and nowhere else especially not in structs or classes. ## Why is the speed-up of my code not as expected? There are several reasons which can reduce the speed-up: - Have you enabled compiler optimizations? You must enable all compiler optimizations (like `-O3`). - Have you compiled in 64 bit mode? There is significant performance increase on architectures supporting 64 bit binaries. - Is your code trivially vectorizable? Modern compilers can vectorize trivial code segments automatically. 
If you benchmark a trivial scalar code versus a vectorized code, the compiler may vectorize the scalar code, thereby giving similar performance to the vectorized version. - Some architectures do not provide certain functionalities. For example AVX2 chips do not provide a way to convert long to double. So using `nsimd::cvt` will produce an emulation for-loop in the resulting binary. To know which intrinsics are used by NSIMD you can consult . ## Why did my code segfault or crash? The most common cause of segfaults in SIMD codes is accessing non-aligned memory. For best performance, all memory should be aligned. NSIMD includes an aligned memory allocation function and an aligned memory allocator to help you with this. Please refer to for details on how to ensure that your memory is correctly aligned. Another common cause is to read or write data beyond the allocated memory. Do not forget that loading data into a SIMD vector will result in loading 16 bytes (or 4 floats) from memory. If this read occurs at the last 2 elements of allocated memory then a segfault will be generated. ## My code compiled for AVX is not twice as fast as for SSE, why? Not all SSE instructions have an equivalent AVX instruction. As a consequence NSIMD uses two SSE operations to emulate the equivalent AVX operation. Also, the cycles required for certain instructions are not equal on both architectures, for example, `sqrt` on `SSE` requires 13-14 cycles whereas `sqrt` on `AVX` requires 21-28 cycles. Please refer [here](https://www.agner.org/optimize/instruction_tables.pdf) for more information. Very few integer operations are supported on AVX, AVX2 is required for most integer operations. If a NSIMD function is called on an integer AVX register, this register will be split into two SSE registers and the equivalent instruction called on both registers. In this case, no speed-up will be observed compared with SSE code. This is true also on POWER 7, where double is not supported. 
## I disassembled my code, and the generated code is less than optimal, why? - Have you compiled in release mode, with full optimizations options? - Have you used a 64 bit compiler? - There are many SIMD related bugs across all compilers, and some compilers generate less than optimal code in some cases. Is it possible to update your compiler to a more modern compiler? - We provide workarounds for several compiler bugs, however, we may have missed some. You may also have found a bug in `nsimd`. Please report this through issues on our github with a minimal code example. We responds quickly to bug reports and do our best to patch them as quickly as possible. ## How can I use a certain intrinsic? If you require a certain intrinsic, you may search inside of NSIMD for it and then call the relevant function or look at . In rare cases, the intrinsic may not be included in NSIMD as we map the intrinsic wherever it makes sense semantically. If a certain intrinsic does not fit inside of this model, if may be excluded. In this case, you may call it yourself, however, note this will not be portable. To use a particular intrinsic say `_mm_avg_epu8`, you can write the following. ```c++ nsimd::pack a, b, result; result = nsimd::pack(_mm_avg_epu8(a.native_register(), b.native_register())); ``` ## How do I convert integers/floats to/from logicals? Use [`nsimd::to_mask`](api_to-mask.md) and [`nsimd::to_logical`](api_to-logical.md). ## How about shuffles? General shuffles are not provided by NSIMD. You can see [issue 8 on github](https://github.com/agenium-scale/nsimd/issues/8). For now we provide only some length agnostic shuffles such as zip and unzip, see [the shuffle API](api.md) at the Shuffle section. ## Are there C++ STL like algorithms? No. You are welcome to [contribute](contribute.md) to NSIMD and add them as a NSIMD module. You should use [expressions templates](module_tet1d_overview.md) instead. 
Strictly conforment STL algorithms do not provide means to control for example the unroll factor or the number of threads per block when compiling for GPUs. ## Are there masked operators in NSIMD? Yes, we provide masked loads and stores, see [the api](api.md) at the "Loads & stores" section. We also provide the [`nsimd::mask_for_loop_tail`](api_mask-for-loop-tail.md) which computes the mask for ending loops. But note that using these is not recommanded as on most architectures there are no intrinsic. This will result in slow code. It is recommanded to finish loops using a scalar implementation. ## Are there gathers and scatter in NSIMD? Yes, we provide gathers and scatters, see [the api](api.md) at the "Loads & stores" section. Note also that as most architectures do not provide such intrinsics and so this could result in slow code. ## Why does not NSIMD recognize the target architecture automatically? Autodetecting the SIMD extension is compiler/compiler version/cpu/system dependant which means a lot of code for a (most likely buggy) feature which can be an inconvenience sometimes. Plus some compilers do not permit this feature. For example cf. and . Thus a "manual" system is always necessary. ## Why some operators have their names ending with an "1"? This is because of C++ and our will not to use C++-useless-complicated stuff. Taking the example with `if_else`, suppose that we have called it "if\_else" without the "1". When working with packs, one wants to be able to use `if_else` in this manner: ```c++ int main() { using namespace nsimd; typedef pack pi; typedef pack pf; int n; int *a, *b; // suppose both points to n ints float *fa, *fb; // suppose both points to n floats for (int i = 0; i < n; i += len()) { packl cond = (loada(&a[i]) < loada(&b[i])); storea(&fb[i], if_else(cond, load(&fb[i]), set1(0.0f))); } return 0; } ``` But this causes a compiler error, the overload of `if_else` is ambiguous. 
Sure one can use many C++-ish techniques to tackle this problem but we chose not to as the goal is to make the life of the compiler as easy as possible. So as we want to favor the C++ advanced API as it is the most human readable, users of the C and C++ base APIs will have to use `if_else1`. ================================================ FILE: doc/markdown/fp16.md ================================================ # IEEE float16 related functions NSIMD natively supports IEEE float16's. This means that NSIMD provides types and functions to deal with them. When the targeted architecture supports them then NSIMD will use approriate intrinsics otherwise emulation with float32's will be used. - When emulating, as float16's are not natively supported by neither C or C++ emulation is done with float32's. - Intel architectures do not support IEEE float16 arithmetic, they only provide, as an extension, supports for convertion to/from float32. When compiling NSIMD for Intel architectures use `-DFP16` to activate the conversion intrinsics if available on your machine. Note that AVX-512 has thoses natively. - Arm architectures can provide native float16 arithmetic. For 32-bits and 64-bits chips (ARMv7 and Aarch64) chips float16 support is optional. When compiling with `-DFP16`, NSIMD will use float16-related intrinsics. Note that for SVE chips float16's are mandatory hence NSIMD will use appropriate intrinsics with or without `-DFP16`. - CUDA provides supports for converting float16's to/from float32's. These are always used by NSIMD. But it is only since devices of compute capabilities 5.3 and above that float16's arithmetic is provided. NSIMD will always use CUDA float16's functions so there is no need to compile with `-DFP16`. - ROCm HIP supports float16's except for the first versions. For now NSIMD assumes that it is always the case and use HIP float16 API. There is no need for `-DFP16`. 
## Float16's related functions and types NSIMD provide the `f16` type which represents a IEEE float16. Note that depending on the targeted architecture and the presence of `-DFP16` the float16 type can typedefs many different types. Therefore the two following functions are provided and can be used to convert a float16 from/to a float32. These functions preserve NaN's and infinities. When converting from a float32 to a float16 saturation to infinities is performed when the float32 cannot be represented as a float16. | Function signature | Availability | |---------------------------------------------------|--------------| | `f16 nsimd_f32_to_f16(f32 a);` | C and C++ | | `f32 nsimd_f16_to_f32(f16 a);` | C and C++ | | `f16 nsimd::f32_to_f16(f32 a);` | C++ only | | `f32 nsimd::f16_to_f32(f16 a);` | C++ only | For loading/storing float16's NSIMD provides other conversion function to/from 16-bits unsigned integers. The integers will hold the IEEE binary representation of the float16's. | Function signature | Availability | |---------------------------------------------------|--------------| | `u16 nsimd_f32_to_u16(f32 a);` | C and C++ | | `f32 nsimd_u16_to_f32(u16 a);` | C and C++ | | `u16 nsimd::f32_to_u16(f32 a);` | C++ only | | `f32 nsimd::u16_to_f32(u16 a);` | C++ only | The `nsimd_*` functions listed above do not use the same linkage type depending on the targeted architecture. When compiling for GPUs the corresponding symbols names are mangled. They use C++ ABI because the float16 type is defined as a C++ class and not as a C struct. We therefore inherit from the implementation of CUDA and HIP/ROCm. Linkage types are listed below. 
| Function signature | CUDA/ROCm | Other architectures | |-----------------------------------|-------------|---------------------| | `f16 nsimd_f32_to_f16(f32 a);` | C++ linkage | C linkage | | `f32 nsimd_f16_to_f32(f16 a);` | C++ linkage | C linkage | | `f16 nsimd::f32_to_f16(f32 a);` | C++ linkage | C++ linkage | | `f32 nsimd::f16_to_f32(f16 a);` | C++ linkage | C++ linkage | | `u16 nsimd_f32_to_u16(f32 a);` | C++ linkage | C linkage | | `f32 nsimd_u16_to_f32(u16 a);` | C++ linkage | C linkage | | `u16 nsimd::f32_to_u16(f32 a);` | C++ linkage | C++ linkage | | `f32 nsimd::u16_to_f32(u16 a);` | C++ linkage | C++ linkage | It is possible to know at compile time in which situation we are. The `NSIMD_C_LINKAGE_FOR_F16` macro if defined means that C linkage is used for `nsimd_*` functions. ================================================ FILE: doc/markdown/how_tests_are_done.md ================================================ # How tests are done? First and foremost note that this is a work in progress and that we are doing our best to have serious testing of the library. We can also state our conclusion on testing: we are not and never will be satisfied with our tests, there are not enough of them, we want more. The current system has on average 15000 tests by SIMD extensions. Thanks to our "Python" approach we can automatically generate tests for all operators and for all types. This has greatly helped us in finding bugs. But, as you know, bugs are always there. ## Why write this? Testing the library has been taken seriously since its very beginning. Tests have gone through several stages: - The first one was during the development of the first version of the library. Tests of operators were done with random numbers as input. Those random numbers were all powers of 2 to ease the comparisons of basic arithmetic types. 
NaNs and infinities were not generated as inputs and operators behaviors with those inputs were not tested - For the second stage random numbers generators have been improved to emit NaNs and infinities. It allowed us to detect many errors in operators, mostly in math functions like cos, sin, exp... But we also discovered bugs in hardware when NaNs and infinities are given to intrinsics. - The third stage, which is the current test system, takes into account the experience we gained with the previous two. As we have abandoned the buggy and slow implementations of math functions coming from Boost.SIMD and now rely on the excellent Sleef () we trust that the math functions are correctly tested. In more detail we do not generate NaNs and infinities anymore because we trust functions coming from Sleef and we do not want to write code in our tests to bypass hardware bugs. We only care that our wrappings are correct and that `nsimd::add` correctly calls add, the fact that the add does not work correctly is a hardware bug then and not the problem of the library. Tests on floating points are done using ULPs. ULP means units in the last place and is commonly used for the comparison of floating point numbers. It is in general a bad idea to compare floats with the `==` operator as it essentially compares bits. Instead we want to check if the results of two computations are "not too far away from each other". When checking an operator, let's say, on CPUs and GPUs, we have to take into account that - the rounding mode may be different and - the precision of the calculation may be different. 
For this entire chapter fix the following: - an integer $b > 1$ (will be our radix), - an integer $p > 1$ (will be the number of digits in the mantissa) - an integer $M > 1$ (will be the minimum exponent allowed for floatting point numbers) A floatting point number is an element of $\mathbb{R}$ of the form $m b^e$ with $e \geq -M$ and $m \in \mathbb{Z}$. More precisely we define the set of floatting point numbers $F$ to be the union of the following two sets: - $\{ mb^e \in F \text{ with } e > -M \}$ the *normal* numbers. - $\{ mb^{-M} \in F \text{ with } m \in \mathbb{Z} \text{ and } 0 < |m| < b^p \}$ the *denormal* or *subnormal* numbers. The set $F$ can be viewed as a subset of $\mathbb{R}$ with the mapping $\phi : (m, e) \mapsto mb^e$ and we will make this abuse of notation in what follows. Usually the sign of the floatting point number is separated from $m$ but we include it "inside" $m$ as it does not change the proofs below and simplifies the notations. Let $a_i \in F$ for $i = 1,2$ such that $a_i = m_i b^{e_i}$. **Proposition:** $\phi$ is injective. **Proof:** Suppose that $a_1 = a_2$ or $m_1b^{e_1} = m_2b^{e_2}$. If $a_1$ and $a_2$ are subnormal numbers then $e_1 = e_2 = -M$ and $m_1 = m_2$. If $a_1$ and $a_2$ are normal numbers suppose that $e_2 > e_1$, then $|\frac{m_2b^{e_2}}{m_1b^{e_1}}| > b^{e_2 + p - 1 - e_1 - p} = b^{e_2 - e_1 - 1} \geq b^{1 - 1} = 1$ therefore $m_2b^{e_2} \neq m_1b^{e_1}$ which is absurd hence $e_1 = e_2$ and as a consequence $m_1 = m_2$. **Definition:** We define the *distance in ULPs between $a_1$ and $a_2$* denoted by $U(a_1, a_2)$ to be: - $|m_1b^{e_1 - e_2} - m_2|$ if $e_1 \geq e_2$, - $|m_1 - m_2b^{e_2 - e_1}|$ otherwise. **Example:** Take $a_1 = 123456 \times 10^5$ and $a_2 = 123789 \times 10^5$ Then as the exponents of $a_1$ and $a_2$ are the same we have $U(123456 \times 10^5, 123789 \times 10^5) = |123789 - 123456| = 333$. The following proposition confort the name "units in the last place". 
**Proposition:** Let $f = \lfloor \log_b U(a_1, a_2) \rfloor + 1$ and suppose that $a_1, a_2$ are of same sign and have the same exponents, then either the first $p - f$ digits of $m_1$ and $m_2$ are identical or their difference is $\pm 1$. **Proof:** For $i = 1,2$ there exists $q_i \in \mathbb{Z}$ and $0 \leq r_i < b^f$ such that $m_i = q_i b^f + r_i$. Then $|q_1 - q_2| \leq \frac{|m_1 - m_2| + |r_1 - r_2|}{b^f} < \frac{b^{\log_b(U(a_1, a_2)} + b^f}{b^f} = 2$ So that either $q_1 = q_2$ or $q_1 - q_2 = \pm 1$. It is interesting to know what are the cases when $q_1 - q_2 \pm 1$. Suppose that $0 \leq m_1 < m_2$ and that $q_1 = q_2 + 1$ then $m_1 = q_1 b^f + r_1 \geq q_2 b^f + b^f > q_2 b^f + r_2 = m_2$ which contradicts the hypothesis hence $q_1 \leq q_2$. Finally $r_1 + U(a_1, a_2) = r_1 + (m_2 - m_1) = q_2 b^f + r_2 - q_1 b^f = r_2 + b_f$ so that: - $r_1 + U(a_1, a_2) \geq b^f$ and - $r_1 = r_2 + (b_f - U(a_1, a_2)) = r_2 + (b^f - b^{\log_b(U(a_1, a_2))}) > r_2$. **Example:** Taking back $a_1 = 123456 \times 10^5$ and $a_2 = 123789 \times 10^5$. As $q_1 = q_2$ we have the first 3 digits of $a_1$ and $a_2$ that are identical and they differ by their last $\log_{10} \lfloor U(a_1, a_2) \rfloor + 1 = \lfloor \log_{10}(333) \rfloor + 1 = 3$ **Example:** Now take $a_1 = 899900 \times 10^5$ and $a_2 = 900100 \times 10^5$. We have $f = 3$ but $q_2 = q_1 + 1$ and $r_2 = 900 > 100 = r_1$ and $r_2 + U(a_1, a_2) = 1100 \geq 1000 = 10^3$. The propositions above show that our definition of the ULP distance is well choosen as we have the following results: - (second proposition) is measures de number of different digits at the end of the mantissa. - (first proposition) if we write the numbers differently but still in base $b$ we only change the number of different digits in the last places by some zeros. The latter number being the exponent of $b$ that represents the difference in scaling of both representations of floatting point numbers. 
We show now how to compute it using the IEEE 754 floatting point numbers representation. A floatting point number $(m, e) \in F$ is stored in memory (and registers) as the integer $\pm ((e + M)b^p + |m|)$. **Proposition:** If $e_2 \geq e_1 + 2$ then $U(a_1, a_2) \geq b^p$. **Proof:** We have $U(a_1, a_2) = |m_2 b^{e_2 - e_1} - m_1| \geq ||m_2| b^{e_2 - e_1} - |m_1||$. But $m_2$ is a normal number otherwise we would have $e_2 = -M = e_1$ so that $|m_2| \geq b^{p - 1}$ and we have $|m_2| b^{e_2 - e_1} \geq b^{p - 1 + e_2 - e_1} \geq b^{p + 1} > |m_1|$, therefore $||m_2| b^{e_2 - e_1} - |m_1|| \geq |m_2|b^2 - |m_1| > b^{p - 1 + 2} - b^p = b^p$. The proposition above basically states that if two floatting point numbers are two orders of magnitude away then that have no digits in common, and that there are godd chances that comparing them is not interesting at all. The usual definition of the distance in ULPs is roughly given as the number of floatting point numbers between the two considered floatting point numbers. More precisely we will denote it by $V$ and it is defined as follows: - $V(a_1, a_2) = |(e_1 + M)b^p + |m_1| - (e_2 + M)b^p - |m_2||$ if $a_1$ and $a_2$ have the same signs - $V(a_1, a_2) = (e_1 + M)b^p + |m_1| + (e_2 + M)b^p + |m_2|$ otherwise. **Proposition:** If $e_1 = e_2$ and $a_1$, $a_2$ have the same sign then $U(a_1, a_2) = V(a_1, a_2)$. **Proof:** We have $V(a_1, a_2) = |(e_1 + M)b^p + m_1 - (e_2 + M)b^p - m_2|$, but as $e_1 = e_2$, we end up with $V(a_1, a_2) = |m_1 - m_2| = U(a_1, a_2)$. **Proposition:** $V(a_1, a_2) = 1$ is equivalent to $U(a_1, a_2) = 1$. **Proof:** The proposition is true if $e_1 = e_2$. Suppose that $e_2 > e_1$. Note that $a_2$ is a normal number so that $m_2 \geq b^{p - 1}$. We first suppose that $V(a_1, a_2) = 1$. Then by the definition of $V$, $a_1$ and $a_2$ have same sign otherwise $V(a_1, a_2) \geq 2$ and we suppose that $a_i \geq 0$. 
Moreover we have $e_2 = e_1 + 1$ otherwise we would have that $a_1 = m_1b^{e_1} < m_1b^{e_1 + 1} < m_2b^{e_1 + 2} \leq a_2$. Now we have $(b^p - 1)b^{e_1} < b^{p - 1}b^{e_1 + 1}$ and let $(b^p - 1)b^{e_1} \leq mb^e \leq b^{p - 1}b^{e_1 + 1}$. First note that if $a = mb^e$ is a normal number then $m \geq b^{p - 1}$ and if $a$ is a subnormal number then $e = -M$ in which case we also have $e_1 = -M$ and $m \geq b^p - 1 \geq b^{p - 1}$. In any case $m \geq b^{p - 1}$. We have $(b^p - 1)/m b^{e_1} < b^e < b^{p - 1}/m b^{e_1 + 1}$. But $1 \leq (b^p - 1) / m$ and $b^{p - 1} / m \leq 1$ so that $b^{e_1} \leq b^e \leq b^{e_1 + 1}$ and $e = e_1$ or $e = e_1 + 1$. In the first case $(b^p - 1)b^{e_1} \leq mb^{e_1}$ so that $b^p - 1 \leq m$ but $m < b^p$ and $m = b^p - 1$. In the second case $mb^{e_1 + 1} \leq b^{p - 1}b^{e_1 + 1}$ so that $m \leq b^{p - 1}$ but $b^{p - 1} \leq m$ and $m = b^{p - 1}$. We have proven that two consecutive elements of $F$ with $e_2 = e_1 + 1$ are neessary of the form $a_1 = (b^p - 1)b^{e_1}$ and $a_2 = b^{p - 1}b^{e_1 + 1}$. Now we can compute $U(a_1, a_2) = |bb^{p - 1} - (b^p - 1)| = 1$. Conversely, suppose that $U(a_1, a_2) = 1$, then $|b^{e_2 - e_1}m_2 - m_1| = 1$. Suppose that $b^{e_2 - e_1}m_2 - m_1 = -1$, then $-1 \geq bb^{p - 1} - b^p = 0$ which is absurd. We then have $b^{e_2 - e_1}m_2 - m_1 = 1$. Suppose that $e_2 \geq e_1 + 2$ then we would have that $b^{e_2 - e_1}m_2 - m_1 \geq b^2b^{p - 1} - b^p \geq b^p$ which is absurd so that $e_2 = e_1 + 1$ and $bm_2 - m_1 = 1$. Suppose that $m_2 \geq b^{p - 1} + 1$ then $bm_2 - m_1 \geq b^p + b - (b^p - 1) \geq 2$ which is absurd so that $m_2 = b^{p - 1}$ and as a consequence $m_1 = b^p - 1$. If $a_1, a_2 < 0$, then $V(a_1, a_2) = 1$ is equivalent by definition to $V(-a_1, -a_2) = 1$ which is equivalent to $U(-a_1, -a_2) = 1$ which is by definition equivalent to $U(a_1, a_2) = 1$. **Proposition:** Suppose that $e_1 \leq e_2 \leq e_1 + 1$ then $V \leq U \leq bV$. 
**Proof:** The proposition is true if $e_1 = e_2$. Suppose now that $e_2 = e_1 + 1$. Then we have $b^p + m_2 - m_1 \geq b^p + b^{p - 1} - b^p \geq 0$ so that $V(a_1, a_2) = b^p + m_2 - m_1 = b^p + m_2(1 - b) + bm_2 - m_1$. But $b^p + m_2(1 - b) \leq b^p + b^p(1 - b) \leq 0$ and $bm_2 - m_1 \geq bb^{p - 1} - b^p = 0$ so that $V(a_1, a_2) \leq bm_2 - m_1 = U(a_1, a_2)$. On the other hand we have $bm_2 - m_1 \leq b(b^p + m_2 - m_1 + m_1 - m_1/b - b^p)$ but $m_1 - m_1/b - b^p \leq b^p - b^{p - 1}/b - b^p \leq 0$ so that $U(a_1, a_2) \leq b(b^p + m_2 - m_1) = bV(a_1, a_2)$. **Remark:** The previous propositions shows that the difference between $V$ and $U$ is only visible when the arguments have differents exponents and are non consecutive. Our version of the distance in ULPs puts more weights when crossing powers of $b$. Also if $e_2 \geq e_1 + 2$ then we have seen that $a_1$ and $a_2$ have nothing in common which is indicated by the fact that $U, V \geq b^p$. **Definition:** We now define the relative distance $D(a_1, a_2)$ between $a_1$ and $a_2$ to be $|a_1 - a_2| / \min(|a_1|, |a_2|)$. **Proposition:** As $U$ is defined in a "mathematical" way compared to $V$ then the relation between $U$ and $D$ is straightforward and we have $D(a_1, a_2) = U(a_1, a_2) / |m_1|$. Moreover we have $b^{-q}U \leq D \leq b^{1 - q}U$ where $q$ is the greatest integer such that $b^{q - 1} \leq |m_1| < b^q$. In particular if $a_1$ is a normal number then $p = q$. **Proof:** Suppose that $|a_1| < |a_2|$, then we have three cases: - If $a_2$ is denormal, then so is $a_1$ and $e_1 = -M = e_2$. - If $a_2$ is normal, then: + If $a_1$ is denormal then $e_1 < e_2$. + If $a_1$ and $a_2$ are normal numbers then $|m_1/m_2| b^{e_1 - e_2} < 1$ but $|m_1/m_2| \geq b^{p - 1} / b^p = b^{-1}$ and we have $b^{e_1 - e_2 - 1} < 1$ so that $e_1 < e_2 + 1$ or $e_1 \leq e_2$. 
In any case we have $e_1 \leq e_2$, as a consequence we have $D(a_1, a_2) = |m_1b^{e_1} - m_2b^{e_2}| / \min(|m_1|b^{e_1}, |m_2|b^{e_2}) = |m_1 - m_2b^{e_2 - e_1}| / \min(|m_1|, |m_2|b^{e_2 - e_1})$. Therefore $D(a_1, a_2) = U(a_1, a_2) / \min(|m_1|, |m_2|b^{e_2 - e_1})$. Now if $e_1 = e_2$ then $\min(|m_1|, |m_2|) = |m_1|$ but if $e_2 > e_1$ then $a_2$ is a normal number and $|m_1| < b^p = b \times b^{p - 1} \leq b^{e_2 - e_1} |m_2|$ and again $\min(|m_1|, |m_2|b^{e_2 - e_1}) = |m_1|$. Applying $b^{q - 1} \leq |m_1| < b^q$ we get that $b^{-q}U \leq D \leq b^{1 - q}U$. If moreover $a_1$ is a normal number then by definition $p = q$. **Remark:** Using the inequality of the previous proposition and taking the base-$b$ logarithm we get $-q + \log U \leq \log D \leq 1 - q + \log U$ and then $-q + \lfloor \log U \rfloor \leq \lfloor \log D \rfloor \leq 1 - q + \lfloor \log U \rfloor$ hence two possibilities: - $-q + \lfloor \log U \rfloor = \lfloor \log D \rfloor$ in which case $\lfloor \log U \rfloor + (-\lfloor \log D \rfloor) = q$. - $1 - q + \lfloor \log U \rfloor = \lfloor \log D \rfloor$ in which case $1 + \lfloor \log U \rfloor + (-\lfloor \log D \rfloor) = q$. According to a above proposition we know that $f = 1 + \lfloor \log U \rfloor$ can be interpreted as the number of differents digits in the last places of the mantissa. Write $\mathcal{D} = - \lfloor \log D \rfloor$ then $q \leq f + \mathcal{D} \leq q + 1$. The latter inequality shows that $\mathcal{D}$ can be interpreted as the number of digits which are the same in the mantissa near the "first" place. Note that for denormal numbers the "first" places are near the bit of most significance. We can conclude this remark with the interpretation that two floatting point numbers have at least $\mathcal{D} - 1$ digits in common in the first place of the mantissa and $f$ digits which are different in the last place of the mantissa. **Algorithm:** We give below the C code for $U$ with a caveat. 
As seen in a previous proposition when $e_2 \geq e_1 + 2$ the arguments have no digit in common and can be considered too far away in which case we return `INT_MAX` (or `LONG_MAX`). As a side effect is that the code will be free of multiprecision integers (which would be necessary as soon as $|e_2 - e_1| \geq 12$) hence lesser dependencies, readability, maintainability and performances. When $|e_2 - e_1| \leq 1$ we use the formula of the definition. ```c /* We suppose that floats are IEEE754 and not NaN nor infinity */ struct fl_t{ int mantissa; int exponent; }; fl_t decompose(float a_) { fl_t ret; unsigned int a; memcpy(&a, &a_, sizeof(float)); /* avoid aliasing */ ret.exponent = (int)((a >> 23) & 0xff) - 127; if (ret.exponent == -127) { /* denormal number */ ret.mantissa = (int)(a & 0x007fffff); } else { ret.mantissa = (int)((1 << 23) | (a & 0x007fffff)); } if (a >> 31) { ret.mantissa = -ret.mantissa; } return ret; } int distance_ulps(float a_, float b_) { fl_t a, b; a = decompose(a_); b = decompose(b_); if (a.exponent - b.exponent < -1 || a.exponent - b.exponent > 1) { return INT_MAX; } int d; if (a.exponent == b.exponent) { d = a.mantissa = b.mantissa; } else if (a.exponent > b.exponent) { d = 2 * a.mantissa - b.mantissa; } else { d = 2 * b.mantissa - a.mantissa; } return d > 0 ? d : -d; } ``` The algorithm for computing $\mathcal{D} - 1$ follows: ```c int d(float a_, float b_) { float absa = fabsf(a_); float absb = fabsf(b_); /* ensure that |a_| <= |b_| */ if (absb < absa) { float tmp = absa; absa = absb; absb = tmp; } fl_t a = decompose(absa); int q = 0; for (q = 0; q <= 23 && (2 << q) <= a.mantissa; q++); int ulps = distance_ulps(a_, b_); int lu; for (lu = 0; lu <= 30 && (2 << (lu + 1)) <= a.mantissa; lu++); return q - (lu + 1) - 1; } ``` ## What we really do in the tests As said above buggy intrinsics can be easily found. But the bugs appears for corner cases typically involving NaNs and/or infinities. 
But according to the philosophy of NSIMD, it is not the job of its standard operators to propose a non buggy alternative to a buggy intrinsics. But we still have the problem of testing. A consequence of the philosophy of NSIMD is that we only have to test that intrinsics are correctly wrapped. We can reasonably assume that testing for floatting point numbers on only normal numbers is more than sufficient. Moreover, an implementation (buggy or not), may have different parameters set that controls how floatting point arithmetic is done on various components of the chip. An non exhaustive list includes: - Rounding modes (which is not controlled by NSIMD as it is a library) - FTZ/DAZ (flush to zero) denormal values never appear. - FTZ/DAZ on some components (SIMD parts) and not others (scalar parts) - Non IEEE behavior (eg. some NVIDIA GPU and ARMv7 chips) - A mix of the above - A buggy mix of the above As a consequence we do not compare floats using the operator `=` nor do we use a weird-buggy formula involving the machine epsilon. Instead we use the algorithm above to make sure that the first bits are correct. More precisely we use the following algorithm and its variants for float16 and doubles where `ufp` stands for `units in the first place`. ```c /* a_ and b_ must be IEEE754 and normal numbers */ int ufps(float a_, float b_) { unsigned int a, b; memcpy(&a, &a_, 4); memcpy(&b, &b_, 4); int ea = (int)((a >> 23) & 0xff); int eb = (int)((b >> 23) & 0xff); if (ea - eb > 1 || ea - eb < -1) { return 0; } int ma = (int)(a & 0x007fffff); int mb = (int)(b & 0x007fffff); int d = 0; if (ea == eb) { d = ma - mb; } else if (ea > eb) { d = 2 * ma - mb; } else { d = 2 * mb - ma); } d = (d >= 0 ? 
d : -d); int i = 0; for (; i < 30 && d >= (1 << i); i++); return 23 - i; } ``` ================================================ FILE: doc/markdown/memory.md ================================================ # Memory functions Although the purpose of NSIMD is not to provide a full memory container library, it provides some helper functions to facilitate the end-user. The functions below only deals with CPUs. If your needs concerns GPUs or memory transfers between CPUs and GPUs see the [memory management module](module_memory_management_overview.md). ## Memory functions available in C and C++ - `void *nsimd_aligned_alloc(nsimd_nat n);` Returns a pointer to `n` bytes of aligned memory. It returns NULL is an error occurs. - `void nsimd_aligned_free(void *ptr);` Frees the memory pointed to by `ptr`. ## Memory functions available in C++ - `void *nsimd::aligned_alloc(nsimd_nat n);` Returns a pointer to `n` bytes of aligned memory. It returns NULL is an error occurs. - `void nsimd::aligned_free(void *ptr);` Frees the memory pointed to by `ptr`. - `template T *nsimd::aligned_alloc_for(nsimd_nat n);` Returns a pointer to `n` `T`'s of aligned memory. It returns NULL is an error occurs. - `template void nsimd::aligned_free_for(void *ptr);` Free memory pointed to by `ptr`. ## C++ allocators for `std::vector`'s NSIMD provides C++ allocators so that memory used by C++ container such as `std::vector`'s will be suitably aligned in memory. - `template class nsimd::allocator;` The class for allocating aligned memory inside C++ containers. Exemple: ```c++ #include int main() { int n = // number of float's to allocate std::vector > myvector(size_t(n)); // In what follows ptr is a pointer suitably aligned for the current SIMD // targeted architecture. float *ptr; // C++98 ptr = &myvector[0]; // C++11 and above ptr = myvector.data(); } ``` As there is no portable way of having aligned scoped memory, one can use the NSIMD allocators to emulate such memory. 
```c++ #include template void test() { std::vector > mem(size_t(N)); T *ptr; // C++98 ptr = &mem[0]; // scoped aligned memory // C++11 and above ptr = mem.data(); // scoped aligned memory } int main() { test(); test(); } ``` ## C++ scoped memory allocation NSIMD provides a struct helper for the user to allocate a chunk of memory and don't care about its release. It uses C++ RAII. ```c++ namespace nsimd { template class scoped_aligned_mem_for { template scoped_aligned_mem(I n); // Construct a struct an array of n T's. T *get(); // Return the pointer to access memory. }; } int main() { // Allocates 1024 floats in memory. It will be freed when the function (or // the program) terminates. nsimd::scoped_aligned_mem_for buffer(1024); return 0; } ``` ================================================ FILE: doc/markdown/modules/.gitignore ================================================ */api*.md ================================================ FILE: doc/markdown/modules/fixed_point/overview.md ================================================ # NSIMD fixed point module ## Description This module implements a fixed-point numbers support for the `nsimd` library. Fixed-point numbers are integer types used to represent decimal numbers. A number `lf` of bits are used to encode its integer part, and `rt` bits are used to encode its fractional part. The fixed_point module uses the templated type `nsimd::fixed_point::fp_t` to represent a fixed_point number. All the basic floating-point arithmetic operaors have been defined, therefore fp_t elements can be manipulated as normal numbers. The fixed_point module will use a `int8_t`, `int16_t`, or `int32_t` integer type for storage, depending on the value of `lf + 2 * rt`. All the functions of the module are under the namespace `nsimd::fixed_point`, and match the same interface than `nsimd`. The `fp_t` struct type is defined in `fixed.hpp`, and the associated simd `fpsimd_t` struct type is defined in `simd.hpp`. 
The modules redefines the `nsimd` pack type for fixed-point numbers, templated with `lf` and `rt` : ```C++ namespace nsimd { namespace fixed_point { template struct pack; } // namespace fixed_point } // namespace nsimd ``` Then, the pack can be manipulated as an `nsimd` pack like other scalar types. ## Compatibility The fixed point module is a C++ only API, compatible with the C++98 standard. It has the same compilers and hardware support than the main `nsimd` API (see the [API index](../../index.md)). ## Example Here is a minimal example(main.cpp) : ```C++ #include #include #include #include float rand_float() { return 4.0f * ((float) rand() / (float) RAND_MAX) - 2.0f; } int main() { // We use fixed point numbers with 8 bits of integer part and 8 bits of // decimal part. It will use a 32 bits integer for internal storage. typedef nsimd::fixed_point::fp_t<8, 8> fp_t; typedef nsimd::fixed_point::pack fp_pack_t; const size_t v_size = nsimd::fixed_point::len(fp_t()); fp_t *input0 = (fp_t*)malloc(v_size * sizeof(fp_t)); fp_t *input1 = (fp_t *)malloc(v_size * sizeof(fp_t)); fp_t *res = (fp_t *)malloc(v_size * sizeof(fp_t)); // Input and output initializations for(size_t i = 0; i < nsimd::fixed_point::len(fp_t()); i++) { input0[i] = fp_t(rand_float()); input1[i] = fp_t(rand_float()); } fp_pack_t v0 = nsimd::fixed_point::loadu(input0); fp_pack_t v1 = nsimd::fixed_point::loadu(input1); fp_pack_t vres = nsimd::fixed_point::add(v0, v1); nsimd::fixed_point::storeu(res, vres); for(size_t i = 0; i < nsimd::fixed_point::len(fp_t()); i++) { std::cout << float(input0[i]) << " | " << float(input1[i]) << " | " << float(res[i]) << "\n"; } std::cout << std::endl; return EXIT_SUCCESS; } ``` To test with avx2 run : ```bash export NSIMD_ROOT= g++ -o main -I$NSIMD_ROOT/include -mavx2 -DNSIMD_AVX2 main.cpp ./main ``` The console output will look like this : ```console $>./main 1.35938 | -0.421875 | 0.9375 1.13281 | 1.19531 | 2.32812 1.64844 | -1.21094 | 0.4375 -0.660156 | 1.07422 | 
0.414062 -0.890625 | 0.214844 | -0.675781 -0.0898438 | 0.515625 | 0.425781 -0.539062 | 0.0546875 | -0.484375 1.80859 | 1.66406 | 3.47266 ``` ================================================ FILE: doc/markdown/pack.md ================================================ # NSIMD pack and related functions The advanced C++ API provides types that represents SIMD registers. These types are struct that allows NSIMD to define infix operators. In this page NSIMD concepts are reported in the documentation but you can think of them as usual `typename`s. ## The Pack type ```c++ template struct pack { // Typedef to retrieve the native SIMD type typedef typename simd_traits::simd_vector simd_vector; // Typedef to retrieve T typedef T value_type; // Typedef to retrieve SimdExt typedef SimdExt simd_ext; // Static member to retrive N static const int unroll = N; // Ctor that splats `s`, the resulting vector will be [s, s, s, ...] template pack(S const &s); // Ctor that takes a SIMD vector of native type // ONLY AVAILABLE when N == 1 pack(simd_vector v); // Retrieve the underlying native SIMD vector // ONLY AVAILABLE when N == 1 simd_vector native_register() const; }; ``` Example: ```c++ #include #include int main() { nsimd::pack v(2.0f); std::cout << v << '\n'; vf32 nv = v.native_register(); nv = nsimd::add(nv, nv, f32()); std::cout << nsimd::pack(nv) << '\n'; return 0; } ``` ### Infix operators available for packs - `pack operator+(pack const &, pack const &);` - `pack operator*(pack const &, pack const &);` - `pack operator-(pack const &, pack const &);` - `pack operator/(pack const &, pack const &);` - `pack operator-(pack const &);` - `pack operator|(pack const &, pack const &);` - `pack operator^(pack const &, pack const &);` - `pack operator&(pack const &, pack const &);` - `pack operator~(pack const &);` - `pack operator<<(pack const &, int);` (only available for integers) - `pack operator>>(pack const &, int);` (only available for integers) ### Assignment operators available 
for packs - `pack operator+=(pack const &);` - `pack operator-=(pack const &);` - `pack operator*=(pack const &);` - `pack operator/=(pack const &);` - `pack &operator|=(pack const &other);` - `pack &operator&=(pack const &other);` - `pack &operator^=(pack const &other);` - `pack &operator<<=(int);` - `pack &operator>>=(int);` ### Function aliases The C++ standard provides functions with different names that does exactly the same thing. This is due to the retro compatibility with C. Take the `fmin` C function as an example. In C this function give the minimum between doubles only. The C++ standard provides overloads to this function so that it can work on floats and long doubles. The aliases provided by NSIMD have the same purpose but they are not provided as operator on their own because their real purpose is to write generic code that can work on scalar and SIMD vector types. As such they are only relevant for the advanced C++ API. - `pack fmin(pack const &, pack const &);` - `pack fmax(pack const &, pack const &);` - `pack fabs(pack const &);` They are contained in the `nsimd/cxx_adv_api_aliases.hpp` header and not provided by default to respect the philosophy of NSIMD which is force the use to think different between SIMD code and scalar code. They are provided automatically when including `nsimd/nsimd-all.hpp`. ## The Packl type ```c++ template struct packl { // Typedef to retrieve the native SIMD type typedef typename simd_traits::simd_vectorl simd_vectorl; // Typedef to retrieve T typedef T value_type; // Typedef to retrieve SimdExt typedef SimdExt simd_ext; // Static member to retrive N static const int unroll = N; // Ctor that splats `s`, the resulting vector will be [s, s, s, ...] 
template packl(S const &s); // Ctor that takes a SIMD vector of native type // ONLY AVAILABLE when N == 1 packl(simd_vectorl v); // Retrieve the underlying native SIMD vector // ONLY AVAILABLE when N == 1 simd_vector native_register() const; }; ``` Example: ```c++ #include #include int main() { nsimd::pack v(2.0f); nsimd::packl mask; mask = nsimd::eq(v, v); std::cout << v << '\n'; mask = nsimd::neq(v, v); std::cout << v << '\n'; return 0; } ``` ### Infix operators involving packls - `packl operator&&(packl const &, packl const &);` - `packl operator||(packl const &, packl const &);` - `packl operator!(packl const &, packl const &);` - `packl operator==(pack const &, pack const &);` - `packl operator!=(pack const &, pack const &);` - `packl operator<(pack const &, pack const &);` - `packl operator<=(pack const &, pack const &);` - `packl operator>(pack const &, pack const &);` - `packl operator>=(pack const &, pack const &);` ## Packs for SoA/AoS Types containing several SIMD vectors are also provided to help the user manipulate arrays of structures. When working, let's say, on complex numbers, loading them from memory with layout `RIRIRIRIRIRI...` can be done with the `load2*` operators that will returns 2 SIMD vectors `RRRR` and `IIII` where `R` stands for real part and `I` for imaginary part. Similarily loading an RGB image from memory stored following the layout `RGBRGBRGBRGB...` can be done with `load3*` to get 3 SIMD vectors `RRRR`, `GGGG` and `BBBB`. 
### Packx1 ```c++ template NSIMD_STRUCT packx1 { // Usual typedefs and static members typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 1; // Member v0 for reading and writing pack v0; }; ``` ### Packx2 ```c++ template NSIMD_STRUCT packx2 { // Usual typedefs and static members typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 2; // Members for reading and writing pack v0; pack v1; }; ``` ### Packx3 ```c++ template NSIMD_STRUCT packx3 { // Usual typedefs and static members typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 3; // Members for reading and writing pack v0; pack v1; pack v2; }; ``` ### Packx4 ```c++ template NSIMD_STRUCT packx4 { // Usual typedefs and static members typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 4; // Members for reading and writing pack v0; pack v1; pack v2; pack v3; }; ``` ### Functions involving packx2, packx3 and packx4 The following functions converts packxs into unrolled packs. The difference between the `to_pack` and `to_pack_interleave` families of functions is in the way they flatten (or deinterleave) the structure of SIMD vectors. 
```c++ template pack to_pack(const packx2 &); template pack to_pack(const packx3 &); template pack to_pack(const packx4 &); template pack to_pack_interleave(const packx2 &); template pack to_pack_interleave(const packx3 &); template pack to_pack_interleave(const packx4 &); ``` The `to_pack` family of functions performs the following operations: ``` packx2 = | v0 = [u0 u1 u2] | ---> [u0 u1 u2 w0 w1 w2] = pack | v1 = [w0 w1 w2] | ``` while the `to_pack_interleave` family of functions does the following: ``` packx2 = | v0 = [u0 u1 u2] | ---> [u0 w0 v1 w1 v2 w2] = pack | v1 = [w0 w1 w2] | ``` ================================================ FILE: doc/markdown/tutorial.md ================================================ # NSIMD tutorial In this tutorial we will write and compile a simple SIMD kernel to become familiar with the basics of NSIMD. We will also see different aspects of SIMD programming: - aligned vs. unaligned data access - basic SIMD arithmetic - SIMD loops - SIMD branching - architecture selection at runtime ## SIMD basics SIMD programming means using the CPU SIMD registers to performs operations on several data at once. A SIMD vector should be viewed as a set of bits which are interpreted by the operators that operate on them. Taking a 128-bits wide SIMD register, it can be interpreted as: - 16 signed/unsigned chars - 8 signed/unsigned shorts - 4 signed/unsigned ints - 4 floats - 2 signed/unsigned longs - 2 doubles as shown in the picture below. ![Register layout](img/register.png) ## Computation kernel We will explain the rewriting of the following kernel which uppercases ASCII letters only. @[INCLUDE_CODE:L7:L16](../../examples/tutorial.cpp) Here is the corresponding SIMD version. Explanations to follow. 
@[INCLUDE_CODE:L18:L39](../../examples/tutorial.cpp) ## Getting started with NSIMD All APIs of NSIMD core is available with this include: @[INCLUDE_CODE:L1:L1](../../examples/tutorial.cpp) For ease of programming with use the NSIMD namespace inside the `uppercase_simd` function. @[INCLUDE_CODE:L20:L20](../../examples/tutorial.cpp) ## SIMD vectors A `nsimd::pack` can be considered analogous to a SIMD register (on your or any other machine). Operations performed on packs - from elementary operations such as addition to complicated functions such as `nsimd::rsqrt11(x)` - will be performed using SIMD registers and operations if supported by your hardware. As shown below, data must be manually loaded into and stored from these registers. Again, for ease of programming we typedef a pack of T's. @[INCLUDE_CODE:L21:L21](../../examples/tutorial.cpp) NSIMD provides another type of pack called `nsimd::packl` which handles vectors of booleans. @[INCLUDE_CODE:L22:L22](../../examples/tutorial.cpp) This distinction between pack's and packl's is necessary ffor two reasons: - On recent hardware, SIMD vectors of booleans are handled by dedicated registers. - Pack and Packl must have different semantics as arithmetic operators on booleans have no sense as well as logical operators on Pack's. ## Loading data from memory One way to construct a `nsimd::pack` is to simply declare (default-construct) it. Such a pack may *not* be zero-initialized and thus may *contain arbitrary values*. Another way to construct a `nsimd::pack` is to fill it with a single value. This so-called splatting constructor takes one scalar value and replicates it in all elements of the pack. But most common usage to construct a `nsimd::pack` is by using the copy constructor from loading functions. @[INCLUDE_CODE:L27:L27](../../examples/tutorial.cpp) ## Aligned vs. unaligned memory Alignement of a given pointer `ptr` to memory to some value `A` means that `ptr % A == 0`. 
On older hardware loading data from unaligned memory can result in performance penalty. On recent hardware it is hard to exhibit a difference. NSIMD provides two versions of "load": - `loada` for loading data from aligned memory - `loadu` for loading data from unaligned momery Note that using `loada` on unaligned pointer may result in segfaults. As recent hardware have good support for unaligned memory we use `loadu`. @[INCLUDE_CODE:L27:L27](../../examples/tutorial.cpp) To ensure that data allocated by `std::vector` is aligned, NSIMD provide a C++ allocator. ```c++ std::vector > data; ``` When loading data from memory you must ensure that there is sufficient data in the block of memory you load from to fill a `nsimd::pack`. For example, on an `AVX` capable machine, a SIMD vector of `float` (32 bits) contains 8 elements. Therefore, there must be at least 8 floats in the memory block you load data from otherwise loading may result in segfaults. More on this below. ## Operations on pack's and packl's Once initialized, `nsimd::pack` instances can be used to perform arithmetic. Usual operations are provided by NSIMD such: - addition - substraction - multiplication - division - square root - bitwise and/or/xor - ... @[INCLUDE_CODE:L28:L29](../../examples/tutorial.cpp) C++ operators are also overloaded for pack's and packl's as well as between pack's and scalars or packl's and booleans. 
## SIMD branching NSIMD provide the `if_else` operator which fill the output, lane by lane, according to the lane value of its first argument: - if it is true, the output lane will be filled with the second argument's lane - if it is false, the output lane will be filled with the third argument's lane Therefore the branching: @[INCLUDE_CODE:L10:L14](../../examples/tutorial.cpp) will be rewritten as @[INCLUDE_CODE:L28:L30](../../examples/tutorial.cpp) or as a one liner @[INCLUDE_CODE:L36:L36](../../examples/tutorial.cpp) ## SIMD loops A SIMD loop is similar to its scalar counterpart except that instead of going through data one element at a time it goes 4 by 4 or 8 by 8 elements at a time. More precisely SIMD loops generally goes from steps equal to pack's length. Therefore the scalar loop @[INCLUDE_CODE:L9:L9](../../examples/tutorial.cpp) is rewritten as @[INCLUDE_CODE:L23:L26](../../examples/tutorial.cpp) Note that going step by step will only cover most of the data except maybe the tail of data in case that the number of elements is not a multiple of the Pack's length. Therefore to perform computations on the tail one has to load data from only `n` elements where `n < len()`. One can use `maskz_loadu` which will load data only on lanes that are marked as true by another argument to the function. @[INCLUDE_CODE:L35:L35](../../examples/tutorial.cpp) The mask can be computed manually but NSIMD provides a function for it. @[INCLUDE_CODE:L34:L34](../../examples/tutorial.cpp) Then the computation on the tail is exactly the same as within the loop. Put together it gives for the tail: @[INCLUDE_CODE:L34:L37](../../examples/tutorial.cpp) Then the entire loop reads as follows. @[INCLUDE_CODE:L25:L37](../../examples/tutorial.cpp) ## Compiling the Code Here is the complete listing of the code. @[INCLUDE_CODE](../../examples/tutorial.cpp) The compilation of a program using `nsimd` is like any other library. 
```bash c++ -O3 -DAVX2 -mavx2 -L/path/to/lib -lnsimd_avx2 -I/path/to/include tutorial.cpp ``` When compiling with NSIMD, you have to decide at compile time the targeted SIMD extensions, AVX2 in the example above. It is therefore necessary to give `-mavx2` to the compiler for it to emit AVX2 instructions. To tell NSIMD that AVX2 has to be used the `-DAVX2` has to be passed to the compiler. For an exhaustive list of defines controlling compilation see . There is a .so file for each SIMD extension, it is therefore necessary to link against the proper .so file. ## Runtime selection of SIMD extensions It is sometimes necessary to have several versions of a given algorithm for different SIMD extensions. This is rather to do with NSIMD. Basically the idea is to write the algorithm in a generic manner using pack's as shown above. It is then sufficient to compile the same soure file for different SIMD extensions and then link the resulting object files altogether. Suppose that a file named `uppercase.cpp` contains the following code: @[INCLUDE_CODE:L18:L38](../../examples/tutorial.cpp) This would give the following in a Makefile. ```makefile all: uppercase uppercase_sse2.o: uppercase.cpp c++ -O3 -DSSE2 -msse2 -c $? -o $@ uppercase_sse42.o: uppercase.cpp c++ -O3 -DSSE42 -msse4.2 -c $? -o $@ uppercase_avx.o: uppercase.cpp c++ -O3 -DAVX -mavx -c $? -o $@ uppercase_avx2.o: uppercase.cpp c++ -O3 -DAVX2 -mavx2 -c $? -o $@ uppercase: uppercase_sse2.o \ uppercase_sse42.o \ uppercase_avx.o \ uppercase_avx2.o main.cpp c++ $? -lnsimd_avx2 -o $@ ``` Note that `libnsimd_avx2` contains all the functions for SSE 2, SSE 4.2, AVX and AVX2. This is a consequence of the retrocompatiblity of Intel SIMD extensions. The situation is the same on ARM where `libnsimd_sve.so` will contain functions for AARCH64. There is a small caveat. The symbol name corresponding to the `uppercase_simd` function will be same for all the object files which will result in error when linking together all objects. 
To avoid this situation one can use function overloading as follows: ```c++ template void uppercase_simd(NSIMD_SIMD, T *dst, const T *src, int n) { // ... } ``` The macro `NSIMD_SIMD` will be expanded to a type containing the information on the SIMD extension currently requested by the user. This techniques is called tag dispatching and does not require *any* modification of the algorithm inside the function. Finally in `main` one has to do dispatching by using either `cpuid` of by another mean. ```c++ int main() { // what follows is pseudo-code switch(cpuid()) { case cpuid_sse2: uppercase(nsimd::sse2, dst, src, n); break; case cpuid_sse42: uppercase(nsimd::sse42, dst, src, n); break; case cpuid_avx: uppercase(nsimd::avx, dst, src, n); break; case cpuid_avx2: uppercase(nsimd::avx2, dst, src, n); break; } return 0; } ``` ================================================ FILE: doc/md2html.cpp ================================================ /* Copyright (c) 2020 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include // ---------------------------------------------------------------------------- // Extract lines form strings like ":L7:L42" // Returns -1 if fails std::pair extract_lines(std::string const &s) { std::pair r(-1, -1); std::vector lines = ns2::split(s, ":L"); if (lines.size() == 3 && lines[0] == "") { try { r.first = std::stoi(lines[1]); r.second = std::stoi(lines[2]); } catch (std::exception const &) { r.first = -1; r.second = -1; } } return r; } // ---------------------------------------------------------------------------- std::string callback_input_filename = ""; std::string callback_macro(std::string const &label, std::string const &url, ns2::markdown_infos_t const &markdown_infos) { std::string filename; if (ns2::startswith(label, "INCLUDE")) { filename = ns2::join_path(ns2::dirname(callback_input_filename), url); } std::string lang; if (ns2::startswith(label, "INCLUDE_CODE")) { std::string const ext = ns2::splitext(filename).second; if (ext == "sh") { lang = "Bash"; } else if (ext == "c" || ext == "h") { lang = "C"; } else if (ext == "cpp" || ext == "hpp") { lang = "C++"; } else if (ext == "py") { lang = "Python"; } } if (ns2::startswith(label, "INCLUDE_CODE:")) { std::string const lines_str = label.substr(label.find(':')); std::pair const l_first_last = extract_lines(lines_str); if (l_first_last.first == -1) { throw std::runtime_error("cannot extract first line number"); } if (l_first_last.second == -1) { throw std::runtime_error("cannot extract last line number"); } std::string out; std::string lines; { ns2::ifile_t in(filename); int num_line = 1; std::string line; while (std::getline(in, line)) { if (num_line == l_first_last.second) { lines += line; } else 
if (num_line < l_first_last.second) { if (num_line >= l_first_last.first) { lines += line + "\n"; } } else { break; } ++num_line; } } ns2::compile_markdown("```" + lang + "\n" + ns2::deindent(lines) + "\n```\n", &out, markdown_infos); return out; } if (ns2::startswith(label, "INCLUDE_CODE")) { std::string out; ns2::compile_markdown("```" + lang + "\n" + ns2::read_file(filename) + "\n```\n", &out, markdown_infos); return out; } if (ns2::startswith(label, "INCLUDE")) { ns2::ifile_t in(filename); std::ostringstream out; ns2::compile_markdown(&in, &out, markdown_infos); return out.str(); } return ""; } // ---------------------------------------------------------------------------- std::pair callback_link(std::string const &label, std::string const &url, ns2::markdown_infos_t const &markdown_infos) { if (markdown_infos.output_format != ns2::HTML) { return std::pair("", false); } std::pair root_basename_ext = ns2::splitext(url); if (root_basename_ext.second == "md") { return std::pair( ns2::html_href(root_basename_ext.first + ".html", label), true); } else { return std::pair("", false); } } // ---------------------------------------------------------------------------- int main(int argc, char **argv) { if (argc != 3) { std::cout << "Usage: " << argv[0] << " " << std::endl; return 1; } std::string const input_filename = argv[1]; std::string const output_filename = argv[2]; ns2::ifile_t input_file(input_filename); ns2::ofile_t output_file(output_filename); std::cout << "Convert \"" << input_filename << "\" to \"" << output_filename << "\"" << std::endl; callback_input_filename = input_filename; ns2::markdown_infos_t markdown_infos(ns2::HTML, callback_macro, callback_link, true); ns2::compile_markdown(&input_file, &output_file, markdown_infos); return 0; } ================================================ FILE: doc/what_is_wrapped.cpp ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any 
person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* This little C++ program reads and parses files from NSIMD wrapping intrinsics in order to build a markdown page describing in a table which operators are just intrinsics wrapper and which one are more complicated. We only to parse C code so no need for complicated stuff. Moreover what we doo is really simple and a C parser is not needed. We replace all C delimiters by spaces, then split the resulting string into words and we get a vector of strings. 
Then search in it the function that we want (say nsimd_add_sse2_f32) along with its opening curly and closing brakets and finally: - if there is only one token then it must be an intrinsic - if there is a for then it must use emulation - if there are several tokens but no for it must be a trick using other intrinsics The produced markdown contains: - E for emulation - T for trick with other intrinsics - NOOP for noop - a link to the Intel/Arm documentation about the intrinsic otherwise Well all that to say that a few hundreds of simple C++ code is more that enough for our need and we don't need to depend on some C/C++ parser such as Clang. Note that using a real parser will be counter productive as some intrinsics are implemented as macros to compiler builtin which then appear in the AST instead of the documented intrinsics. This code is completely non-optimized and we don't care because it does not take time to execute and it is not our purpose to optimize this code. */ // ---------------------------------------------------------------------------- #include #include #include #include // ---------------------------------------------------------------------------- #define MAX_LEN (11 * 11) typedef std::map table_t; std::string type_names_str("i8,u8,i16,u16,i32,u32,i64,u64,f16,f32,f64"); std::vector types_list(ns2::split(type_names_str, ",")); const size_t not_found = ~((size_t)0); // ---------------------------------------------------------------------------- int nbits(std::string const &typ) { if (typ == "i8" || typ == "u8") { return 8; } else { return (10 * (typ[1] - '0')) + (typ[2] - '0'); } } // ---------------------------------------------------------------------------- std::vector get_types_names(std::string const &output) { std::vector const& list = types_list; if (output == "same") { return list; } std::vector ret; for (size_t i = 0; i < list.size(); i++) { for (size_t j = 0; j < list.size(); j++) { if ((output == "same_size" && nbits(list[j]) == 
nbits(list[i])) || (output == "bigger_size" && nbits(list[j]) == 2 * nbits(list[i])) || (output == "lesser_size" && 2 * nbits(list[j]) == nbits(list[i]))) { ret.push_back(list[j] + "_" + list[i]); } } } return ret; } // ---------------------------------------------------------------------------- size_t find(std::vector const &haystack, std::string const &needle, size_t i0 = 0) { for (size_t i = i0; i < haystack.size(); i++) { if (haystack[i] == needle) { return i; } } return not_found; } // ---------------------------------------------------------------------------- size_t find_by_prefix(std::vector const &needles, std::string const &haystack) { for (size_t i = 0; i < needles.size(); i++) { if (ns2::startswith(haystack, needles[i])) { return i; } } return not_found; } // ---------------------------------------------------------------------------- int is_number(std::string const &s) { for (size_t i = 0; i < s.size(); i++) { if (s[i] != 'x' && s[i] != 'l' && s[i] != 'L' && s[i] != 'u' && s[i] != 'U' && !(s[i] >= '0' && s[i] <= '9')) { return false; } } return true; } // ---------------------------------------------------------------------------- int is_macro(std::string const &s) { for (size_t i = 0; i < s.size(); i++) { if (s[i] != '_' || !(s[i] >= 'A' && s[i] <= 'Z')) { return false; } } return true; } // ---------------------------------------------------------------------------- void parse_file(std::string const &input_vars, std::string const &simd_ext, std::vector const &types_names, std::string const &op_name, std::string const &filename, table_t *table_) { table_t &table = *table_; std::string content(ns2::read_file(filename)); // replace all C delimiters by spaces except {} for (size_t i = 0; i < content.size(); i++) { const char delims[] = "()[];,:+-*/%&|!%\n\t\r"; for (size_t j = 0; j < sizeof(delims); j++) { if (content[i] == delims[j]) { content[i] = ' '; break; } } } // replace '{' by ' { ' and same for '}' in case there are some code // just 
before/after it content = ns2::replace(ns2::replace(content, "}", " } "), "{", " { "); // now split string on spaces and removes some tokens std::vector to_be_removed( ns2::split("return,signed,unsigned,char,short,int,long,float,double," "const,void,__vector,__bool,bool,vector" + type_names_str + "," + input_vars, ',')); std::vector to_be_removed_by_prefix(ns2::split( "_mm_cast,_mm256_cast,_mm512_cast,vreinterpret,svreinterpret,svptrue_", ',')); std::vector tokens; { // to free tokens0 afterwards std::vector tokens0 = ns2::split(content, ' '); for (size_t i = 0; i < tokens0.size(); i++) { // We also remove svptrue_* as they are everywhere for SVE and all // casts as they incur no opcode and are often used for intrinsics // not supporting certain types if (tokens0[i].size() == 0 || is_number(tokens0[i]) || is_macro(tokens0[i]) || find_by_prefix(to_be_removed_by_prefix, tokens0[i]) != not_found || find(to_be_removed, tokens0[i]) != not_found) { continue; } tokens.push_back(tokens0[i]); } } // finally search for intrinsics for (size_t typ = 0; typ < types_names.size(); typ++) { std::string func_name("nsimd_" + op_name + "_" + simd_ext + "_" + types_names[typ]); // find func_name size_t pos = find(tokens, func_name); if (pos == not_found) { table[op_name][typ] = "NA"; continue; } // find opening { size_t i0 = find(tokens, "{", pos); if (i0 == not_found) { std::cerr << "WARNING: cannot find opening '{' for '" << func_name << "' in '" << filename << "'\n"; table[op_name][typ] = "NA"; continue; } // find closing } size_t i1 = i0; int nest = 0; for (i1 = i0; i1 < tokens.size(); i1++) { if (tokens[i1] == "{") { nest++; } else if (tokens[i1] == "}") { nest--; } if (nest == 0) { break; } } // if there is no token inside {} then it must be a noop // if there is only one token inside {} then it must be the intrinsic // if there is a for loop then it must be emulation // if there are several tokens but no for then it must be a trick if (i0 + 1 == i1) { table[op_name][typ] = 
"NOOP"; } else if (i0 + 2 == i1 && !ns2::startswith(tokens[i0 + 1], "nsimd_")) { table[op_name][typ] = "[`" + tokens[i0 + 1] + "`]"; if (simd_ext == "neon128" || simd_ext == "aarch64") { table[op_name][typ] += "(https://developer.arm.com/architectures/instruction-sets/" "intrinsics/" + tokens[i0 + 1] + ")"; } else if (ns2::startswith(simd_ext, "sve")) { table[op_name][typ] += "(https://developer.arm.com/documentation/100987/0000)"; } else if (simd_ext == "sse2" || simd_ext == "sse42" || simd_ext == "avx" || simd_ext == "avx2" || simd_ext == "avx512_knl" || simd_ext == "avx512_skylake") { table[op_name][typ] += "(https://software.intel.com/sites/landingpage/" "IntrinsicsGuide/#text=" + tokens[i0 + 1] + ")"; } else if (simd_ext == "vsx" || simd_ext == "vmx") { table[op_name][typ] += "(https://www.ibm.com/docs/en/xl-c-aix/13.1.3?topic=functions-" + ns2::replace(tokens[i0 + 1], "_", "-") + ")"; } } else { if (find(std::vector(tokens.begin() + i0, tokens.begin() + (i1 + 1)), "for") != not_found) { table[op_name][typ] = "E"; } else { table[op_name][typ] = "T"; } } } } // ---------------------------------------------------------------------------- std::string md_row(int nb_col, std::string const &cell_content) { std::string ret("|"); for (int i = 0; i < nb_col; i++) { ret += cell_content + "|"; } return ret; } // ---------------------------------------------------------------------------- int main(int argc, char **argv) { if ((argc % 2) != 0 || argc <= 5) { std::cout << "Usage: " << argv[0] << " a0,a1,a2 simd_ext output_type operator1 file1 operator2 file2 " "...\n" << "where output_type is (same|same_size|bigger_size|lesser_size)" << std::endl; return 1; } std::string input_vars(argv[1]); std::string simd_ext(argv[2]); std::string output_type(argv[3]); std::vector types_names = get_types_names(output_type); table_t table; for (int i = 4; i < argc; i += 2) { parse_file(input_vars, simd_ext, types_names, argv[i], argv[i + 1], &table); } for (table_t::const_iterator it = 
table.begin(); it != table.end(); it++) { std::cout << "## " << it->first << "\n\n"; if (output_type == "same") { const std::string(&row)[MAX_LEN] = it->second; for (size_t i = 0; i < types_list.size(); i++) { std::cout << "- " << it->first << " on **" << types_list[i] << "**: " << row[i] << "\n"; } std::cout << "\n\n"; } else { const std::string(&row)[MAX_LEN] = it->second; for (size_t i = 0; i < types_list.size(); i++) { for (size_t j = 0; j < types_list.size(); j++) { std::string cell_content; std::string typ(types_list[j] + "_" + types_list[i]); for (size_t k = 0; k < types_names.size(); k++) { if (typ == types_names[k]) { cell_content = row[k]; break; } } if (cell_content.size() > 0) { std::cout << "- " << it->first << " from **" << types_list[i] << "** to **" << types_list[j] << "**: " << cell_content << "\n"; } } std::cout << "\n"; } std::cout << "\n"; } } return 0; } ================================================ FILE: egg/__init__.py ================================================ # Copyright (c) 2019 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. from . import operators ================================================ FILE: egg/common.py ================================================ # Use utf-8 encoding # -*- coding: utf-8 -*- # Copyright (c) 2020 Agenium Scale # # permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # ----------------------------------------------------------------------------- # What does this script? # ---------------------- # # This is only a python module that holds what is shared by `generate.py`, # the `platform_*.py` files and all other python code in `egg`. If contains # the list of supported types, functions, operators, and some useful helper # functions such as the python equivalent of `mkdir -p`. 
# ----------------------------------------------------------------------------- # Import section import math import os import sys import io import collections import platform import string import shutil import math # ----------------------------------------------------------------------------- # print def myprint(opts, obj): if opts.list_files: return print('-- {}'.format(obj)) # ----------------------------------------------------------------------------- # check if file exists def can_create_filename(opts, filename): if opts.list_files: print(filename) return False if opts.verbose: sys.stdout.write('-- {}: '.format(filename)) if os.path.isfile(filename) and not opts.force: if opts.verbose: sys.stdout.write('skipping\n') return False elif opts.force: if opts.verbose: sys.stdout.write('creating (forced)\n') return True else: if opts.verbose: sys.stdout.write('creating (missing)\n') return True # ----------------------------------------------------------------------------- # open with UTF8 encoding def open_utf8(opts, filename): dummy, ext = os.path.splitext(filename) if ext.lower() in ['.c', '.h', '.cpp', '.hpp', '.cc', '.cxx', '.hxx', '.hpp']: begin_comment = '/*' end_comment = '*/' elif ext.lower() in ['.md', '.htm', '.html']: begin_comment = '' else: begin_comment = None with io.open(filename, mode='w', encoding='utf-8') as fout: if begin_comment is not None: if opts.simple_license: fout.write('''{} Copyright (c) 2021 Agenium Scale {} '''.format(begin_comment, end_comment)) else: fout.write('''{} Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The 
above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. {} '''.format(begin_comment, end_comment)) fout.write('{} This file has been auto-generated {}\n\n'.\ format(begin_comment, end_comment)) return io.open(filename, mode='a', encoding='utf-8') # ----------------------------------------------------------------------------- # clang-format def clang_format(opts, filename, cuda=False): with io.open(filename, 'a', encoding='utf-8') as fout: fout.write('\n') if not opts.enable_clang_format: # TODO: not sure if needed to implement a smarter call to clang-format if cuda: os.system('clang-format -style="{{ Standard: Cpp11 }}" -i {}'. \ format(filename)) else: os.system('clang-format -style="{{ Standard: Cpp03 }}" -i {}'. 
\ format(filename)) if cuda: shutil.copyfile(filename, filename[:-4] + '.cu') # ----------------------------------------------------------------------------- # Not implemented response NOT_IMPLEMENTED = 'abort();' # ----------------------------------------------------------------------------- # C/C++ comment hbar hbar = '/* ' + ('-' * 73) + ' */' # ----------------------------------------------------------------------------- # Convert constants for operators OUTPUT_TO_SAME_TYPE = 0 OUTPUT_TO_SAME_SIZE_TYPES = 1 OUTPUT_TO_UP_TYPES = 2 OUTPUT_TO_DOWN_TYPES = 3 # ----------------------------------------------------------------------------- # SIMD type x86_simds = [ 'sse2', 'sse42', 'avx', 'avx2', 'avx512_knl', 'avx512_skylake', ] arm_simds = [ 'neon128', 'aarch64', 'sve', 'sve128', 'sve256', 'sve512', 'sve1024', 'sve2048' ] ppc_simds = [ 'vmx', 'vsx', ] simds = ['cpu'] + x86_simds + arm_simds + ppc_simds simds_deps = { 'cpu': ['cpu'], 'sse2': ['cpu', 'sse2'], 'sse42': ['cpu', 'sse2', 'sse42'], 'avx': ['cpu', 'sse2', 'sse42', 'avx'], 'avx2': ['cpu', 'sse2', 'sse42', 'avx', 'avx2'], 'fma4': [], 'avx512_knl': ['cpu', 'sse2', 'sse42', 'avx', 'avx2', 'avx512_knl'], 'avx512_skylake': ['cpu', 'sse2', 'sse42', 'avx', 'avx2', 'avx512_skylake'], 'neon128': ['cpu', 'neon128'], 'aarch64': ['cpu', 'aarch64'], 'sve': ['cpu', 'aarch64', 'sve'], 'sve128': ['cpu', 'aarch64', 'sve128'], 'sve256': ['cpu', 'aarch64', 'sve256'], 'sve512': ['cpu', 'aarch64', 'sve512'], 'sve1024': ['cpu', 'aarch64', 'sve1024'], 'sve2048': ['cpu', 'aarch64', 'sve2048'], 'vmx': ['cpu', 'vmx'], 'vsx': ['cpu', 'vmx', 'vsx'] } ftypes = ['f64', 'f32', 'f16'] ftypes_no_f16 = ['f64', 'f32'] itypes = ['i64', 'i32', 'i16', 'i8'] utypes = ['u64', 'u32', 'u16', 'u8'] iutypes = itypes + utypes types = ftypes + iutypes def logical(typ): return 'l{}'.format(typ) signed_type = { 'i8': 'i8', 'u8': 'i8', 'i16': 'i16', 'u16': 'i16', 'i32': 'i32', 'u32': 'i32', 'i64': 'i64', 'u64': 'i64', 'f16': 'f16', 'f32': 'f32', 'f64': 
'f64' } bitfield_type = { 'i8': 'u8', 'u8': 'u8', 'i16': 'u16', 'u16': 'u16', 'i32': 'u32', 'u32': 'u32', 'i64': 'u64', 'u64': 'u64', 'f16': 'u16', 'f32': 'u32', 'f64': 'u64' } in0 = 'a0' in1 = 'a1' in2 = 'a2' in3 = 'a3' in4 = 'a4' in5 = 'a5' CPU_NBITS = 128 if CPU_NBITS != 128: raise ValueError('CPU_NBITS must be 128') def get_arg(i): fmtspec = { 'in0': in0, 'in1': in1, 'in2': in2, 'in3': in3, 'in4': in4, 'in5': in5 } return '{{in{}}}'.format(i).format(**fmtspec) def get_args(n): fmtspec = { 'in0': in0, 'in1': in1, 'in2': in2, 'in3': in3, 'in4': in4, 'in5': in5 } return ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \ for i in range(0, n)]) def get_simds_deps_from_opts(opts): simds = set() for simd1 in opts.simd: for simd2 in simds_deps[simd1]: simds.add(simd2) return simds def bitsize(typ): if not (typ in types): raise ValueError('Unknown type "{}"'.format(typ)) return int(typ[1:]) def sizeof(typ): return bitsize(typ) // 8 def ilog2(x): if x <= 0: return None for i in range(0, x): if 2 ** (i + 1) > x: return i #def get_same_size_types(typ): # nbits = typ[1:] # if typ in ['i8' ,'u8']: # return ['i8', 'u8'] # else: # return ['i' + nbits, 'u' + nbits, 'f' + nbits] def get_output_types(from_typ, output_to): if output_to == OUTPUT_TO_SAME_TYPE: return [from_typ] else: nbits = from_typ[1:] if output_to == OUTPUT_TO_SAME_SIZE_TYPES: if from_typ in ['i8' ,'u8']: return ['i8', 'u8'] else: return ['i' + nbits, 'u' + nbits, 'f' + nbits] elif output_to == OUTPUT_TO_UP_TYPES: if nbits == '64': raise ValueError('No uptype for ' + from_typ) else: n = str(int(nbits) * 2) return ['i' + n, 'u' + n, 'f' + n] elif output_to == OUTPUT_TO_DOWN_TYPES: n = str(int(nbits) // 2) if nbits == '8': raise ValueError('No downtype for ' + from_typ) elif nbits == '16': return ['i' + n, 'u' + n] else: return ['i' + n, 'u' + n, 'f' + n] else: raise ValueError('Invalid argument for "output_to": {}'. 
\ format(output_to)) # ----------------------------------------------------------------------------- # mkdir -p (avoid a dependency for just one function) def mkdir_p(path): if os.path.isdir(path): return path head, tail = os.path.split(path) if head != '': mkdir_p(head) os.mkdir(path) return path # ----------------------------------------------------------------------------- # Replacement of enumerate def enum(l): ret = [] for i in range(0, len(l)): ret.append([i, l[i]]) return ret # ----------------------------------------------------------------------------- # List of supported SIMD operators/functions # v = SIMD vector parameter # vi = SIMD vector of signed integers parameter # vx2 = struct of 2 SIMD vector parameters # vx3 = struct of 3 SIMD vector parameters # vx4 = struct of 4 SIMD vector parameters # l = SIMD vector of logicals parameter # s = Scalar parameter # * = Pointer to scalar parameter # c* = Pointer to const scalar parameter # _ = void (only for return type) # p = Parameter (int) # ----------------------------------------------------------------------------- # Type generators def get_one_type_generic(param, typ): if param == '_': return 'void' elif param == 'p': return 'int' elif param == 's': return typ elif param == '*': return '{}*'.format(typ) elif param == 'c*': return '{} const*'.format(typ) elif param == 'vi': return 'vi{}'.format(typ[1:]) elif param == 'v': return 'v{}'.format(typ) elif param == 'vx2': return 'v{}x2'.format(typ) elif param == 'vx3': return 'v{}x3'.format(typ) elif param == 'vx4': return 'v{}x4'.format(typ) elif param == 'l': return 'vl{}'.format(typ) else: raise ValueError("Unknown param '{}'".format(param)) def get_one_type_specific(param, ext, typ): if param == '_': return 'void' elif param == 'p': return 'int' elif param == 's': return typ elif param == '*': return '{}*'.format(typ) elif param == 'c*': return '{} const*'.format(typ) elif param == 'vi': return 'nsimd_{}_vi{}'.format(ext, typ[1:]) elif param == 'v': return 
'nsimd_{}_v{}'.format(ext, typ) elif param == 'vx2': return 'nsimd_{}_v{}x2'.format(ext, typ) elif param == 'vx3': return 'nsimd_{}_v{}x3'.format(ext, typ) elif param == 'vx4': return 'nsimd_{}_v{}x4'.format(ext, typ) elif param == 'l': return 'nsimd_{}_vl{}'.format(ext, typ) else: raise ValueError("Unknown param '{}'".format(param)) def get_one_type_pack(param, inout, N): if param == '_': return 'void' if param == 'p': return 'int' if param == '*': return 'T*' if param == 'c*': return 'T const*' if param == 's': return 'T' if param in ['v', 'vx2', 'vx3', 'vx4']: if inout == 0: return 'pack const&'.format(N) else: return 'pack'.format(N) if param == 'vi': if inout == 0: return 'pack::itype, {}, SimdExt> const&'. \ format(N) else: return 'pack::itype, {}, SimdExt>'.format(N) if param == 'l': if inout == 0: return 'packl const&'.format(N) else: return 'packl'.format(N) raise ValueError("Unknown param '{}'".format(param)) def get_one_type_generic_adv_cxx(param, T, N): if param == '_': return 'void' elif param == 'p': return 'int' elif param == '*': return '{}*'.format(T) elif param == 'c*': return '{} const*'.format(T) elif param == 's': return T elif param == 'v': return 'pack<{}, {}, SimdExt>'.format(T, N) elif param == 'vi': return 'pack'.format(T[1:], N) elif param == 'vx2': return 'packx2<{}, {}, SimdExt>'.format(T, N) elif param == 'vx3': return 'packx3<{}, {}, SimdExt>'.format(T, N) elif param == 'vx4': return 'packx4<{}, {}, SimdExt>'.format(T, N) elif param == 'l': return 'packl<{}, {}, SimdExt>'.format(T, N) else: raise ValueError('Unknown param: "{}"'.format(param)) def get_one_type_scalar(param, t): if param == '_': return 'void' elif param in ['p', 'l']: return 'int' elif param in ['s', 'v']: return t else: raise ValueError('Unknown param: "{}"'.format(param)) def get_first_discriminating_type(params): for i in range(len(params)): if params[i] in ['v', 'l', 'vx2', 'vx3', 'vx4']: return i return -1 # 
----------------------------------------------------------------------------- # Formats def pprint_lines(what): return '\n'.join(what) def pprint_commas(what): return ', '.join(what) def pprint_includes(what): return pprint_lines('#include {}'.format(i) for i in what) # ----------------------------------------------------------------------------- # Function parsing signatures def parse_signature(signature): l = signature.split(' '); name = l[1] if len(l) > 2: params = [l[0]] + l[2:] else: params = [l[0]] return (name, params) # ----------------------------------------------------------------------------- # Load platforms def get_platforms(opts): if opts.platforms_list != None: return opts.platforms_list ret = dict() path = opts.script_dir myprint(opts, 'Searching platforms in "{}"'.format(path)) for mod_file in os.listdir(path): if mod_file[-3:] == '.py' and mod_file[0:9] == 'platform_': mod_name = mod_file[:-3] myprint(opts, 'Found new platform: {}'.format(mod_name[9:])) ret[mod_name[9:]] = __import__(mod_name) opts.platforms_list = ret return ret # ----------------------------------------------------------------------------- # Find modules def get_modules(opts): if opts.modules_list != None: return opts.modules_list ret = dict() # We have one module by directory path = os.path.join(opts.script_dir, 'modules') myprint(opts, 'Searching modules in "{}"'.format(path)) for module_dir in os.listdir(path): if (not os.path.isdir(os.path.join(path, module_dir))) or \ module_dir == '.' or module_dir == '..' 
or \ (not os.path.exists(os.path.join(path, module_dir, 'hatch.py'))): continue myprint(opts, 'Found new module: {}'.format(module_dir)) mod = __import__('modules.{}.hatch'.format(module_dir)) ret[module_dir] = mod opts.modules_list = ret return ret # ----------------------------------------------------------------------------- # Integer limits per type using macros defined in or limits = { 'i8': {'min': 'NSIMD_I8_MIN', 'max': 'NSIMD_I8_MAX' }, 'i16': {'min': 'NSIMD_I16_MIN', 'max': 'NSIMD_I16_MAX' }, 'i32': {'min': 'NSIMD_I32_MIN', 'max': 'NSIMD_I32_MAX' }, 'i64': {'min': 'NSIMD_I64_MIN', 'max': 'NSIMD_I64_MAX' }, 'u8': {'min': 'NSIMD_U8_MIN', 'max': 'NSIMD_U8_MAX' }, 'u16': {'min': 'NSIMD_U16_MIN', 'max': 'NSIMD_U16_MAX' }, 'u32': {'min': 'NSIMD_U32_MIN', 'max': 'NSIMD_U32_MAX' }, 'u64': {'min': 'NSIMD_U64_MIN', 'max': 'NSIMD_U64_MAX' } } # ----------------------------------------------------------------------------- # Misc def ext_from_lang(lang): return 'c' if lang == 'c_base' else 'cpp' def nsimd_category(category): return 'nsimd_' + category # ------------------------------------------------------------------------------ # Doc common def to_filename(op_name): valid = string.ascii_letters + string.digits ret = '' for c in op_name: ret += '-' if c not in valid else c return ret def get_markdown_dir(opts): return os.path.join(opts.script_dir, '..', 'doc', 'markdown') def get_markdown_api_file(opts, name, module=''): root = get_markdown_dir(opts) op_name = to_filename(name) if module == '': return os.path.join(root, 'api_{}.md'.format(op_name)) else: return os.path.join(root, 'module_{}_api_{}.md'.format(module, op_name)) def get_markdown_file(opts, name, module=''): root = get_markdown_dir(opts) op_name = to_filename(name) if module == '': return os.path.join(root, '{}.md'.format(op_name)) else: return os.path.join(root, 'module_{}_{}.md'.format(module, op_name)) ================================================ FILE: egg/cuda.py 
================================================ # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import common import scalar fmtspec = dict() # ----------------------------------------------------------------------------- # NVIDIA doc on f16 can be found at # https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__HALF.html def get_impl_f16(operator, totyp, typ): if operator.name == 'round_to_even': arch53_code = 'return hrint({in0});'.format(**fmtspec) elif operator.name in ['rec', 'rec8', 'rec11']: arch53_code = 'return hrcp({in0});'.format(**fmtspec) elif operator.name in ['rsqrt8', 'rsqrt11']: arch53_code = 'return hrsqrt({in0});'.format(**fmtspec) elif operator.name in ['fma', 'fms', 'fnma', 'fnms']: neg = '-' if operator.name in ['fnma, fnms'] else '' op = '-' if operator.name in ['fnms, fms'] else '' arch53_code = 'return __hfma({neg}{in0}, {in1}, {op}{in2});'. 
\ format(neg=neg, op=op, **fmtspec) elif operator.name in ['min', 'max']: intr = '__hlt' if operator.name == 'min' else '__hgt' arch53_code = '''if ({intr}) {{ return {in0}; }} else {{ return {in1}; }}'''.format(intr=intr, **fmtspec) elif operator.name in ['adds', 'subs']: arch53_code = 'return __h{op}({in0}, {in1});'. \ format(op=operator.name[:-1], **fmtspec) else: args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \ for i in range(len(operator.params[1:]))]) # Some f16 functions are not prefixed by `__` not_prefixed = ['ceil', 'floor', 'trunc', 'sqrt'] if operator.name in not_prefixed: arch53_code = 'return h{}({});'.format(operator.name, args) else: arch53_code = 'return __h{}({});'.format(operator.name, args) args = ', '.join(['__half2float({{in{}}})'.format(i).format(**fmtspec) \ for i in range(len(operator.params[1:]))]) if operator.params[0] == 'l': emul = 'return gpu_{}({});'.format(operator.name, args) else: emul = 'return __float2half(gpu_{}({}));'.format(operator.name, args) return '''#if __CUDA_ARCH__ >= 530 {arch53_code} #else {emul} #endif'''.format(arch53_code=arch53_code, emul=emul) # ----------------------------------------------------------------------------- # Reinterprets on CUDA have intrinsics def reinterpret(totyp, typ): if typ == totyp: return 'return {in0};'.format(**fmtspec) cuda_typ = { 'i16': 'short', 'u16': 'ushort', 'f16': 'half', 'i32': 'int', 'u32': 'uint', 'f32': 'float', 'f64': 'double', 'i64': 'longlong' } if typ in cuda_typ and totyp in cuda_typ and \ ((typ in common.ftypes and totyp in common.iutypes) or \ (typ in common.iutypes and totyp in common.ftypes)): return 'return __{typ2}_as_{totyp2}({in0});'. 
def get_impl(operator, totyp, typ):
    # Returns the C (CUDA device) code implementing `operator` for scalar
    # type `typ`. For non-closed operators (reinterpret, cvt, to_mask,
    # to_logical) `totyp` is the output type, otherwise it equals `typ`.
    global fmtspec

    fmtspec = {
        'in0': common.in0,
        'in1': common.in1,
        'in2': common.in2,
        'typ': typ,
        'totyp': totyp,
        'typnbits': typ[1:]
    }

    # src operators: map nsimd's Sleef-suffixed names onto CUDA libm names
    if operator.src:
        cuda_ops = {
            'sin_u35': 'sin',
            'cos_u35': 'cos',
            'tan_u35': 'tan',
            'asin_u35': 'asin',
            'acos_u35': 'acos',
            'atan_u35': 'atan',
            'atan2_u35': 'atan2',
            'log_u35': 'log',
            'cbrt_u35': 'cbrt',
            'sin_u10': 'sin',
            'cos_u10': 'cos',
            'tan_u10': 'tan',
            'asin_u10': 'asin',
            'acos_u10': 'acos',
            'atan_u10': 'atan',
            'atan2_u10': 'atan2',
            'log_u10': 'log',
            'cbrt_u10': 'cbrt',
            'exp_u10': 'exp',
            'pow_u10': 'pow',
            'sinh_u10': 'sinh',
            'cosh_u10': 'cosh',
            'tanh_u10': 'tanh',
            'sinh_u35': 'sinh',
            'cosh_u35': 'cosh',
            'tanh_u35': 'tanh',
            'asinh_u10': 'asinh',
            'acosh_u10': 'acosh',
            'atanh_u10': 'atanh',
            'exp2_u10': 'exp2',
            'exp2_u35': 'exp2',
            'exp10_u10': 'exp10',
            'exp10_u35': 'exp10',
            'expm1_u10': 'expm1',
            'log10_u10': 'log10',
            'log2_u10': 'log2',
            'log2_u35': 'log2',
            'log1p_u10': 'log1p',
            'sinpi_u05': 'sinpi',
            'cospi_u05': 'cospi',
            'hypot_u05': 'hypot',
            'hypot_u35': 'hypot',
            'remainder': 'remainder',
            'fmod': 'fmod',
            'lgamma_u10': 'lgamma',
            'tgamma_u10': 'tgamma',
            'erf_u10': 'erf',
            'erfc_u15': 'erfc'
        }
        args = common.get_args(len(operator.params[1:]))
        cuda_op = cuda_ops[operator.name]
        if typ == 'f16':
            # For f16 CUDA offers only a few operator
            # (the listed ones are all unary so `args` is a single argument
            # and can be spliced into __half2float() directly)
            if cuda_op in ['cos', 'exp', 'exp10', 'exp2', 'log', 'log10',
                           'log2', 'sin']:
                return '''#if __CUDA_ARCH__ >= 530
                            return h{}({});
                          #else
                            return __float2half(gpu_{}(__half2float({})));
                          #endif'''.format(cuda_op, args, operator.name, args)
            else:
                # no f16 intrinsic available: promote every argument to f32,
                # compute, then convert the result back to f16
                args = ', '.join('__half2float({})'.format(common.get_arg(i)) \
                                 for i in range(len(operator.params[1:])))
                return 'return __float2half(gpu_{}({}));'. \
                       format(operator.name, args)
        elif typ == 'f32':
            return 'return {}f({});'.format(cuda_op, args)
        else:
            return 'return {}({});'.format(cuda_op, args)

    # bool first, no special treatment for f16's
    bool_operators = {
        'andl': 'return {in0} && {in1};',
        'orl': 'return {in0} || {in1};',
        'xorl': 'return {in0} ^ {in1};',
        'andnotl': 'return {in0} && (!{in1});',
        'notl': 'return !{in0};',
    }

    if operator.name in bool_operators:
        return bool_operators[operator.name].format(**fmtspec)

    # infix operators that needs type punning, no special treatment for f16's
    def pun_code(code, arity, typ):
        # Bitwise ops work directly on unsigned types; for other types the
        # arguments are memcpy'd into same-width unsigned buffers first.
        if typ in common.utypes:
            return 'return ' + code.format(**fmtspec) + ';'
        utyp = common.bitfield_type[typ]
        to_utyp = '\n'.join(
                  ['''{utyp} buf{i};
                      memcpy(&buf{i}, &{{in{i}}}, sizeof({{in{i}}}));'''. \
                      format(i=i, utyp=utyp).format(**fmtspec) \
                      for i in range(arity)])
        return '''{to_utyp}
                  {utyp} tmp = {code};
                  {typ} ret;
                  memcpy(&ret, &tmp, sizeof(tmp));
                  return ret;'''.format(to_utyp=to_utyp, utyp=utyp, typ=typ,
                                        code=code.format(in0='buf0',
                                                         in1='buf1'))

    pun_operators = {
        'orb': lambda: pun_code('{in0} | {in1}', 2, typ),
        'andb': lambda: pun_code('{in0} & {in1}', 2, typ),
        'andnotb': lambda: pun_code('{in0} & (~{in1})', 2, typ),
        'notb': lambda: pun_code('~{in0}', 1, typ),
        'xorb': lambda: pun_code('{in0} ^ {in1}', 2, typ),
    }

    if operator.name in pun_operators:
        return pun_operators[operator.name]()

    # reinterpret
    if operator.name == 'reinterpret':
        return reinterpret(totyp, typ)

    # cvt
    if operator.name == 'cvt':
        return 'return ({totyp}){in0};'.format(**fmtspec)

    # to_mask
    if operator.name == 'to_mask':
        if typ in common.utypes:
            return 'return ({typ})({in0} ? -1 : 0);'.format(**fmtspec)
        return 'return gpu_reinterpret({typ}(), ({utyp})({in0} ? -1 : 0));'. \
               format(utyp=common.bitfield_type[typ], **fmtspec)

    # to_logical
    if operator.name == 'to_logical':
        if typ in common.iutypes:
            return 'return {in0} == ({typ})0 ? false : true;'. \
                   format(**fmtspec)
        return '''return gpu_reinterpret({utyp}(), {in0}) == ({utyp})0 ?
                         false : true ;'''. \
                         format(utyp=common.bitfield_type[typ], **fmtspec)

    # for all other operators, f16 has a special treatment
    if typ == 'f16':
        return get_impl_f16(operator, totyp, typ)

    # then deal with f32's operators

    # first infix operators
    c_operators = {
        'add': 'return ({typ})({in0} + {in1});',
        'sub': 'return ({typ})({in0} - {in1});',
        'mul': 'return ({typ})({in0} * {in1});',
        'div': 'return ({typ})({in0} / {in1});',
        'neg': 'return ({typ})(-{in0});',
        'rec': 'return 1.0{f} / {in0};',
        'rec8': 'return 1.0{f} / {in0};',
        'rec11': 'return 1.0{f} / {in0};',
        'lt': 'return {in0} < {in1};',
        'gt': 'return {in0} > {in1};',
        'le': 'return {in0} <= {in1};',
        'ge': 'return {in0} >= {in1};',
        'ne': 'return {in0} != {in1};',
        'eq': 'return {in0} == {in1};',
        'shl': 'return ({typ})({in0} << {in1});',
    }

    if operator.name in c_operators:
        return c_operators[operator.name]. \
               format(f='f' if typ == 'f32' else '', **fmtspec)

    # right shifts: on signed types shr goes through the unsigned
    # representation (logical shift) and shra re-creates the sign bits
    if operator.name in ['shr', 'shra']:
        if typ in common.utypes:
            return 'return ({typ})({in0} >> {in1});'.format(**fmtspec)
        if operator.name == 'shr':
            return \
            '''return gpu_reinterpret({typ}(), ({utyp})(
                          gpu_reinterpret({utyp}(), {in0}) >> {in1}));'''. \
                          format(utyp=common.bitfield_type[typ], **fmtspec)
        # getting here means shra on signed types
        return \
        '''if ({in1} == 0) {{
             return {in0};
           }}
           if ({in0} >= 0) {{
             return gpu_reinterpret({typ}(), ({utyp})(
                        gpu_reinterpret({utyp}(), {in0}) >> {in1}));
           }} else {{
             {utyp} mask = ({utyp})((({utyp})-1) << ({typnbits} - {in1}));
             return gpu_reinterpret({typ}(), (({utyp})(mask |
                      ({utyp})(gpu_reinterpret({utyp}(), {in0}) >>
                               {in1}))));
           }}'''.format(utyp=common.bitfield_type[typ], **fmtspec)

    # adds
    if operator.name == 'adds':
        if typ in common.ftypes:
            return c_operators['add'].format(**fmtspec)
        else:
            return scalar.get_impl(operator, totyp, typ)

    # subs
    if operator.name == 'subs':
        if typ in common.ftypes:
            return c_operators['sub'].format(**fmtspec)
        elif typ in common.utypes:
            return scalar.get_impl(operator, totyp, typ)
        else:
            return 'return nsimd::gpu_adds({in0}, ({typ})(-{in1}));'. \
                   format(**fmtspec)

    # fma's
    if operator.name in ['fma', 'fms', 'fnma', 'fnms']:
        # Bug fix: these membership tests used single-element lists
        # ['fnma, fnms'] and ['fnms, fms'] (one comma-containing string),
        # so they were always False and fms/fnma/fnms all generated the
        # code of plain fma.
        neg = '-' if operator.name in ['fnma', 'fnms'] else ''
        op = '-' if operator.name in ['fnms', 'fms'] else ''
        if typ in common.ftypes:
            return 'return fma{f}({neg}{in0}, {in1}, {op}{in2});'. \
                   format(f='f' if typ == 'f32' else '', neg=neg, op=op,
                          **fmtspec)
        else:
            return 'return {neg}{in0} * {in1} + ({op}{in2});'. \
                   format(neg=neg, op=op, **fmtspec)

    # other operators
    if typ in common.iutypes:
        # integer types: rounding is the identity, min/max/abs are ternaries
        if operator.name in ['round_to_even', 'ceil', 'floor', 'trunc']:
            return 'return {in0};'.format(**fmtspec)
        elif operator.name == 'min':
            return 'return ({typ})({in0} < {in1} ? {in0} : {in1});'. \
                   format(**fmtspec)
        elif operator.name == 'max':
            return 'return ({typ})({in0} > {in1} ? {in0} : {in1});'. \
                   format(**fmtspec)
        elif operator.name == 'abs':
            return 'return ({typ})({in0} > 0 ? {in0} : -{in0});'. \
                   format(**fmtspec)
        # NOTE(review): any other operator on integer types falls through
        # and returns None — presumably unreachable; verify against callers.
    else:
        # floating types: defer to the CUDA libm function, renaming the few
        # operators whose nsimd name differs from the CUDA one
        cuda_name = {
            'round_to_even': 'rint',
            'min': 'fmin',
            'max': 'fmax',
            'abs': 'fabs',
            'ceil': 'ceil',
            'floor': 'floor',
            'trunc': 'trunc',
            'rsqrt8': 'rsqrt',
            'rsqrt11': 'rsqrt'
        }
        args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \
                          for i in range(len(operator.args))])
        return 'return {name}{f}({args});'. \
               format(name=cuda_name[operator.name] \
                           if operator.name in cuda_name \
                           else operator.name,
                      f='f' if typ == 'f32' else '',
                      args=args)
import os

# This experiment script parses SLEEF's funcproto.h and prints, for every
# function we care about, a ready-to-paste operator class stub for nsimd's
# egg/operators.py. It expects the SLEEF sources to have been fetched into
# _deps-sleef at the repository root (two levels above egg/experiments/).
script_dir = os.path.dirname(os.path.realpath(__file__))
sleef_dir = os.path.join(script_dir, '..', '..', '_deps-sleef')
sleef_version = '3.5.1'
funcproto = os.path.join(sleef_dir, 'sleef-{}'.format(sleef_version),
                         'src', 'libm', 'funcproto.h')

# SLEEF "ulp" field -> suffix used in SLEEF symbol names.
ulp_suffix = {
    '0' : '',
    '1' : '_u1',
    '2' : '_u05',
    '3' : '_u35',
    '4' : '_u15',
    '5' : '_u3500'
}

# SLEEF "funcType" field -> nsimd signature template; the '{}' is later
# filled with the operator name ('v' = vector, 'p' = scalar parameter,
# 'vx2' = pair of vectors).
func_type = {
    '0' : 'v {} v',
    '1' : 'v {} v v',
    '2' : 'vx2 {} v',
    '3' : 'v {} v p',
    '4' : 'v {} v',
    '5' : 'v {} v v v',
    '6' : 'vx2 {} v',
    '7' : 'p {} p',
    '8' : '* {} p'
}

# Function name -> [human-readable name, doc category, math domain].
props = {
    'cos' : ['cosine', 'DocTrigo', 'R'],
    'sin' : ['sine', 'DocTrigo', 'R'],
    'fastcos' : ['cosine', 'DocTrigo', 'R'],
    'fastsin' : ['sine', 'DocTrigo', 'R'],
    'cospi' : ['cosine of multiple of pi argument', 'DocTrigo', 'R'],
    'sinpi' : ['sine of multiple of pi argument', 'DocTrigo', 'R'],
    'tan' : ['tangent', 'DocTrigo', 'R\{(z+0.5)*pi}'],
    'acos' : ['arc cosine', 'DocTrigo', '(-1,1)'],
    'asin' : ['arc sine', 'DocTrigo', '(-1,1)'],
    'atan' : ['arc tangent', 'DocTrigo', 'R'],
    'atan2' : ['arc tangent', 'DocTrigo', 'RxR'],
    'log' : ['natural logarithmic', 'DocExpLog', '(0,Inf)'],
    'log2' : ['base-2 logarithmic', 'DocExpLog', '(0,Inf)'],
    'log10' : ['base-10 logarithmic', 'DocExpLog', '(0,Inf)'],
    'log1p' : ['logarithm of one plus argument', 'DocExpLog', '(-1,Inf)'],
    'exp' : ['exponential', 'DocExpLog', 'R'],
    'exp2' : ['base-2 exponential', 'DocExpLog', 'R'],
    'exp10' : ['base-10 exponential', 'DocExpLog', 'R'],
    'expm1' : ['exponential minus 1', 'DocExpLog', 'R'],
    'pow' : ['power', 'DocExpLog', 'RxR'],
    'fastpow' : ['power', 'DocExpLog', 'RxR'],
    'cbrt' : ['cubic root', 'DocBasicArithmetic', 'R'],
    'hypot' : ['hypotenuse', 'DocBasicArithmetic', 'RxR'],
    'sinh': ['hyperbolic sine', 'DocHyper', 'R'],
    'cosh': ['hyperbolic cosine', 'DocHyper', 'R'],
    'tanh': ['hyperbolic tangent', 'DocHyper', 'R'],
    'asinh': ['hyperbolic arc sine', 'DocHyper', 'R'],
    'acosh': ['hyperbolic arc cosine', 'DocHyper', '(1,Inf)'],
    'atanh': ['hyperbolic arc tangent', 'DocHyper', '(-1,1)'],
    'lgamma' : ['log gamma', 'DocMisc', 'R\{-n}'],
    'tgamma' : ['gamma', 'DocMisc', 'R\{-n}'],
    'erf' : ['error function', 'DocMisc', 'R'],
    'erfc' : ['complementary error function', 'DocMisc', 'R']
}

with open(funcproto, 'r') as fin:
    for line in fin:
        # Entries of interest look like:  { "name", ulp, ..., funcType, ... }
        if not (line.find('{') != -1 and line.find('}') != -1):
            continue
        items = [item.strip() for item in line.strip(' \n\r{},').split(',')]
        items[0] = items[0].strip('"')
        # The table is NULL-terminated.
        if items[0] == 'NULL':
            break
        if items[0] not in props:
            continue
        name = items[0] + '_u' + items[1]
        symbol = 'nsimd_sleef_{}'.format(name)
        prop = props[items[0]]
        # NOTE(review): the generated text says 'Class' (capital C); this is
        # not valid Python and presumably needs a manual lowercase before
        # pasting into operators.py — verify intentional.
        print('Class {}{}(SrcOperator):'. \
              format(name[0].upper(), name[1:]))
        print(' full_name = \'{}\''.format(prop[0]))
        # Double format: the first wraps the signature template in quotes,
        # the second fills its '{}' with the operator name.
        print(' signature = \'{}\''.format(func_type[items[3]]) \
              .format(name))
        print(' sleef_symbol_prefix = \'{}\''.format(symbol))
        print(' domain = Domain(\'{}\')'.format(prop[2]))
        print(' categories = [{}]'.format(prop[1]))
        # items[1] is the precision in tenths of ulps. NOTE(review): the
        # 'visit .' text looks like a URL was lost during extraction of this
        # file — confirm against the repository original.
        print(' desc = \'Compute the {} of its argument{} with ' \
              'a precision of {} ulps. For more informations visit ' \
              '.\''.format(prop[0],
                           's' if items[3] in ['1', '3', '5'] else '',
                           float(items[1]) / 10.0))
        print('')
// armclang -march=armv8+sve egg/experiments/upcvt-sve.c -o ../build/a.out // --- int len32() { return (int)svcntp_b32(svptrue_b32(), svptrue_b32()); } void print32(FILE *out, const char *var, svfloat32_t a) { float buf[2048]; svst1_f32(svptrue_b32(), buf, a); fprintf(out, "%s = ", var); for (int i = 0; i < len32(); i++) { if (i > 0) { fputs(", ", out); } fprintf(out, "%f", (double)buf[i]); } fputc('\n', stdout); } svfloat32_t iota32(float i0) { float buf[2048]; for (int i = 0; i < len32(); i++) { buf[i] = i0 + (float)i; } return svld1(svptrue_b32(), buf); } // --- int len64() { return (int)svcntp_b64(svptrue_b64(), svptrue_b64()); } void print64(FILE *out, const char *var, svfloat64_t a) { double buf[2048]; svst1_f64(svptrue_b64(), buf, a); fprintf(out, "%s = ", var); for (int i = 0; i < len64(); i++) { if (i > 0) { fputs(", ", out); } fprintf(out, "%f", buf[i]); } fputc('\n', stdout); } // --- int main() { svfloat32_t a = iota32(0.0f); svfloat32_t b = iota32(8.0f); svfloat64_t c = svcvt_f64_f32_z(svptrue_b32(), svzip1_f32(a, a)); print32(stdout, "a ", a); print32(stdout, "aa", svzip1_f32(a, a)); print64(stdout, "c ", c); return 0; } ================================================ FILE: egg/gen_adv_c_api.py ================================================ # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. 
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import common
import os
import operators

# -----------------------------------------------------------------------------
# Construct C11 types

def get_c11_types(simd_ext):
    # Builds, for the given SIMD extension (and its dependencies), the C
    # struct typedefs nsimd_pack*_{typ}_{se}, their make_* constructors and
    # the _Generic dispatch macros the C11 advanced API relies on.
    ret = ''
    for se in common.simds_deps[simd_ext]:
        # plain packs: one SIMD vector wrapped in a struct
        ret += '\n\n'.join([
        '''typedef NSIMD_STRUCT nsimd_pack_{typ}_{se} {{
             nsimd_{se}_v{typ} v;
           }} nsimd_pack_{typ}_{se};

           NSIMD_INLINE nsimd_pack_{typ}_{se}
           nsimd_make_pack_{typ}_{se}(nsimd_{se}_v{typ} v) {{
             return (nsimd_pack_{typ}_{se}){{ v }};
           }}'''.format(typ=typ, se=se) for typ in common.types])
        ret += '\n\n'
        # logical packs: one SIMD vector of logicals wrapped in a struct
        ret += '\n\n'.join([
        '''typedef NSIMD_STRUCT nsimd_packl_{typ}_{se} {{
             nsimd_{se}_vl{typ} v;
           }} nsimd_packl_{typ}_{se};

           NSIMD_INLINE nsimd_packl_{typ}_{se}
           nsimd_make_packl_{typ}_{se}(nsimd_{se}_vl{typ} v) {{
             return (nsimd_packl_{typ}_{se}){{ v }};
           }}'''.format(typ=typ, se=se) for typ in common.types])
        # degree 2/3/4 packs holding `deg` plain packs named v0..v{deg-1}
        for deg in [2, 3, 4]:
            vs = ', '.join(['v{}'.format(i) for i in range(deg)])
            # `avs` expands to literal C braces: { {a0.v0}, {a0.v1}, ... }
            # i.e. each member pack is brace-initialized from a0's vectors
            avs = ', '.join(['{{a0.v{}}}'.format(i) for i in range(deg)])
            ret += '\n\n'
            ret += '\n\n'.join([
            '''typedef NSIMD_STRUCT nsimd_packx{deg}_{typ}_{se} {{
                 nsimd_pack_{typ}_{se} {vs};
               }} nsimd_packx{deg}_{typ}_{se};

               NSIMD_INLINE nsimd_packx{deg}_{typ}_{se}
               nsimd_make_packx{deg}_{typ}_{se}
                   (nsimd_{se}_v{typ}x{deg} a0) {{
                 return (nsimd_packx{deg}_{typ}_{se}){{ {avs} }};
               }}
            '''. \
            format(typ=typ, se=se, vs=vs, deg=deg, avs=avs) \
            for typ in common.types])
    ret += '\n\n'
    # generic constructor: dispatch nsimd_make_pack on the pack type of `var`
    ret += '#define nsimd_make_pack(var, func) ' \
           '_Generic(var, \\\n'
    ret += '\n'.join([
           'nsimd_pack_{typ}_{se}: nsimd_make_pack_{typ}_{se}, \\'. \
           format(typ=typ, se=se) for typ in common.types \
                                  for se in common.simds_deps[simd_ext]])
    ret += '\n'
    ret += '\n'.join([
           'nsimd_packl_{typ}_{se}: nsimd_make_packl_{typ}_{se}, \\'. \
           format(typ=typ, se=se) for typ in common.types \
                                  for se in common.simds_deps[simd_ext]])
    ret += '\n'
    ret += '\n'.join([
           'nsimd_packx{d}_{typ}_{se}: nsimd_make_packx{d}_{typ}_{se}, \\'. \
           format(typ=typ, se=se, d=d) for typ in common.types \
                                       for d in [2, 3, 4] \
                                       for se in common.simds_deps[simd_ext]])
    ret += '\ndefault: nsimd_c11_type_unsupported)(func)'
    ret += '\n\n'
    # short aliases bound to the current (top-level) SIMD extension
    ret += '\n'.join([
           'typedef nsimd_pack_{typ}_{simd_ext} nsimd_pack_{typ};'. \
           format(typ=typ, simd_ext=simd_ext) for typ in common.types])
    ret += '\n\n'
    ret += '\n'.join([
           'typedef nsimd_packl_{typ}_{simd_ext} nsimd_packl_{typ};'. \
           format(typ=typ, simd_ext=simd_ext) for typ in common.types])
    ret += '\n\n'
    ret += '\n'.join([
           'typedef nsimd_packx{d}_{typ}_{simd_ext} nsimd_packx{d}_{typ};'. \
           format(typ=typ, simd_ext=simd_ext, d=d) \
           for typ in common.types for d in [2, 3, 4]])
    ret += '\n\n'
    # type-mapping macros: given an expression of one pack kind, produce a
    # (never-evaluated) expression of the corresponding other pack kind via
    # a call through a NULL function pointer inside _Generic
    ret += '#define nsimd_c11_pack(var) _Generic((var), \\\n'
    ret += '\n'.join([
           'nsimd_packl_{typ}_{se}: ' \
           '((nsimd_pack_{typ}_{se} (*)())NULL)(), \\'. \
           format(typ=typ, se=se) for typ in common.types \
                                  for se in common.simds_deps[simd_ext]])
    ret += '\ndefault: NULL)'
    ret += '\n\n'
    ret += '#define nsimd_c11_packl(var) _Generic((var), \\\n'
    ret += '\n'.join([
           'nsimd_pack_{typ}_{se}: ' \
           '((nsimd_packl_{typ}_{se} (*)())NULL)(), \\'. \
           format(typ=typ, se=se) for typ in common.types \
                                  for se in common.simds_deps[simd_ext]])
    ret += '\ndefault: NULL)'
    ret += '\n\n'
    ret += '#define nsimd_c11_packx2(var) _Generic((var), \\\n'
    ret += '\n'.join([
           'nsimd_pack_{typ}_{se}: ' \
           '((nsimd_packx2_{typ}_{se} (*)())NULL)(), \\'. \
           format(typ=typ, se=se) for typ in common.types \
                                  for se in common.simds_deps[simd_ext]])
    ret += '\ndefault: NULL)'
    return ret

# -----------------------------------------------------------------------------
# Construct C11 overloads

def get_c11_overloads(op, simd_ext):
    # Builds the _Generic-based macro overloads for operator `op`. Dispatch
    # is driven by the first argument whose type discriminates the overload.
    if common.get_first_discriminating_type(op.params) == -1:
        # Only the len operator should go here
        assert op.name == 'len'
        # len is dispatched on a *type name* (token pasting), not on a value
        ret = '\n\n'.join([
        '''#define NSIMD_C11_LEN_nsimd_pack_{typ}_{se}() \\
               nsimd_len_{se}_{typ}()
           #define NSIMD_C11_LEN_nsimd_packl_{typ}_{se}() \\
               nsimd_len_{se}_{typ}()
           #define NSIMD_C11_LEN_nsimd_packx2_{typ}_{se}() \\
               (2 * nsimd_len_{se}_{typ}())
           #define NSIMD_C11_LEN_nsimd_packx3_{typ}_{se}() \\
               (3 * nsimd_len_{se}_{typ}())
           #define NSIMD_C11_LEN_nsimd_packx4_{typ}_{se}() \\
               (4 * nsimd_len_{se}_{typ}())'''.format(typ=typ, se=se) \
           for typ in op.types for se in common.simds_deps[simd_ext]])
        ret += '\n\n'
        ret += '\n\n'.join([
        '''#define NSIMD_C11_LEN_nsimd_pack_{typ}() \\
               nsimd_len_{simd_ext}_{typ}()
           #define NSIMD_C11_LEN_nsimd_packl_{typ}() \\
               nsimd_len_{simd_ext}_{typ}()
           #define NSIMD_C11_LEN_nsimd_packx2_{typ}() \\
               (2 * nsimd_len_{simd_ext}_{typ}())
           #define NSIMD_C11_LEN_nsimd_packx3_{typ}() \\
               (3 * nsimd_len_{simd_ext}_{typ}())
           #define NSIMD_C11_LEN_nsimd_packx4_{typ}() \\
               (4 * nsimd_len_{simd_ext}_{typ}())'''. \
           format(typ=typ, simd_ext=simd_ext) for typ in common.types])
        ret += '\n\n'
        ret += '#define nsimd_len(type) \\\n' \
               'NSIMD_PP_CAT_2(NSIMD_C11_LEN_, type)()\n\n'
        return ret

    def get_c11_arg(param, name):
        # Maps a macro argument to the expression passed to the base C API:
        # pointers/scalars go through unchanged, packs are unwrapped (.v).
        if param in ['*', 'c*', 's', 'p']:
            return name
        elif param in ['v', 'l', 'vi']:
            return '({}).v'.format(name)

    args = op.params[1:]
    i0 = common.get_first_discriminating_type(args)
    if i0 == -1:
        # no argument discriminates: dispatch on an explicit `type` macro
        # argument via token pasting (like len above)
        if op.params[0] == 'v':
            pack = 'pack'
        elif op.params[0] == 'l':
            pack = 'packl'
        elif op.params[0] == 'vx2':
            pack = 'packx2'
        elif op.params[0] == 'vx3':
            pack = 'packx3'
        elif op.params[0] == 'vx4':
            pack = 'packx4'
        macro_args = ', '.join(['a{}'.format(i) for i in range(len(args))])
        ret = '\n\n'.join([
        '''#define NSIMD_C11_{OP_NAME}_nsimd_{pack}_{typ}_{se}({macro_args}) \\
               nsimd_make_{pack}_{typ}_{se}( \\
                   nsimd_{op_name}_{se}_{typ}({macro_args}))'''. \
                   format(OP_NAME=op.name.upper(), se=se,
                          macro_args=macro_args, op_name=op.name, typ=typ,
                          pack=pack) \
                   for typ in op.types \
                   for se in common.simds_deps[simd_ext]])
        ret += '\n\n'
        ret += '\n\n'.join([
        '''#define NSIMD_C11_{OP_NAME}_nsimd_{pack}_{typ}({macro_args}) \\
               nsimd_make_{pack}_{typ}_{simd_ext}( \\
                   nsimd_{op_name}_{simd_ext}_{typ}({macro_args}))'''. \
                   format(OP_NAME=op.name.upper(), simd_ext=simd_ext,
                          macro_args=macro_args, op_name=op.name, typ=typ,
                          pack=pack) for typ in op.types])
        ret += '\n\n'
        type_args = ', '.join(['type'] + \
                              ['a{}'.format(i) for i in range(len(args))])
        call_args = ', '.join([get_c11_arg(args[i], 'a{}'.format(i)) \
                               for i in range(len(args))])
        ret += '\n\n#define nsimd_{op_name}({type_args})' \
               ' NSIMD_PP_CAT_2(NSIMD_C11_{OP_NAME}_, type)({call_args})'. \
               format(op_name=op.name, OP_NAME=op.name.upper(),
                      call_args=call_args, type_args=type_args)
        return ret

    # Getting here means that i0 >= 0 i.e. that overloads can be determined
    # by argument i0 of the operator which is in ['v', 'l', 'vx2', 'vx3',
    # 'vx4']
    macro_args = ['a{}'.format(i) for i in range(len(args))]
    call_args = ', '.join([get_c11_arg(args[i], 'a{}'.format(i)) \
                           for i in range(len(args))])
    if not op.closed:
        # non-closed operators take the destination type as first macro arg
        macro_args = ['to_type'] + macro_args
    macro_args = ', '.join(macro_args)
    if op.params[0] in ['v', 'l', 'vx2', 'vx3', 'vx4']:
        # result is a pack: wrap the base-API result with nsimd_make_pack,
        # whose first argument selects the right constructor
        if not op.closed:
            ret = '#define nsimd_{}({}) ' \
                  'nsimd_make_pack((((to_type (*)())NULL)()), ' \
                  '_Generic(({}), \\\n'. \
                  format(op.name, macro_args, 'a{}'.format(i0))
        else:
            if op.params[0] != args[i0]:
                # output pack kind differs from the dispatch argument's kind:
                # derive it with the nsimd_c11_pack* mapping macros
                if op.params[0] == 'v':
                    ctrl_expr = 'nsimd_c11_pack(a{})'.format(i0)
                elif op.params[0] == 'l':
                    ctrl_expr = 'nsimd_c11_packl(a{})'.format(i0)
                elif op.params[0] == 'vx2':
                    ctrl_expr = 'nsimd_c11_packx2(a{})'.format(i0)
            else:
                ctrl_expr = 'a{}'.format(i0)
            ret = '#define nsimd_{}({}) ' \
                  'nsimd_make_pack({}, _Generic(({}), \\\n'. \
                  format(op.name, macro_args, ctrl_expr,
                         'a{}'.format(i0))
    else:
        # result is scalar/void: plain _Generic dispatch, no wrapping
        ret = '#define nsimd_{}({}) _Generic(({}), \\\n'. \
              format(op.name, macro_args, 'a{}'.format(i0))
    suf = { 'v': '', 'l': 'l', 'vx2': 'x2', 'vx3': 'x3', 'vx4': 'x4'}
    arg = args[i0]
    typ_fmt = 'nsimd_pack{}_{{}}_{{}}'.format(suf[arg])
    for se in common.simds_deps[simd_ext]:
        for typ in op.types:
            ret += typ_fmt.format(typ, se) + ': '
            if op.closed:
                ret += 'nsimd_{}_{}_{}, \\\n'.format(op.name, se, typ)
                continue
            # non-closed: nest a second _Generic on the destination type
            ret += '_Generic(((to_type (*)())NULL)(), \\\n'
            for to_typ in common.get_output_types(typ, op.output_to):
                to_pack = 'nsimd_pack{}_{}_{}'. \
                          format(suf[op.params[0]], to_typ, se)
                ret += '  {}: nsimd_{}_{}_{}_{}, \\\n'. \
                       format(to_pack, op.name, se, to_typ, typ)
            ret += '  default: nsimd_c11_type_unsupported), \\\n'
    ret += 'default: nsimd_c11_type_unsupported)({})'.format(call_args)
    if op.params[0] in ['v', 'l', 'vx2', 'vx3', 'vx4']:
        # close the nsimd_make_pack( opened above
        ret += ')'
    return ret

# -----------------------------------------------------------------------------

def doit(opts):
    # Entry point: writes include/c_adv_api_functions.h with the C11 types
    # and overload macros for every enabled SIMD extension.
    common.myprint(opts, 'Generating advanced C API (requires C11)')
    filename = os.path.join(opts.include_dir, 'c_adv_api_functions.h')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as out:
        # NOTE(review): the '#include ' below has no header name — the
        # <...> target appears to have been lost during extraction of this
        # file; confirm against the repository original.
        out.write('''#ifndef NSIMD_C_ADV_API_FUNCTIONS_H
#define NSIMD_C_ADV_API_FUNCTIONS_H

#include 

''')
        for simd_ext in common.simds:
            out.write('''{hbar}
{hbar}
{hbar}

/* {SIMD_EXT} */

{hbar}
{hbar}
{hbar}

#ifdef NSIMD_{SIMD_EXT}

{types}

'''.format(hbar=common.hbar, types=get_c11_types(simd_ext),
           SIMD_EXT=simd_ext.upper()))
            for op_name, operator in operators.operators.items():
                out.write('/* {} */\n\n{}\n\n'. \
                          format(op_name,
                                 get_c11_overloads(operator, simd_ext)))
            out.write('\n\n#endif')
        out.write('\n\n{}\n\n#endif\n'.format(common.hbar))
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import operators
import common
import os
from datetime import date
import sys

# -----------------------------------------------------------------------------
# Actual implementation

def get_cxx_advanced_generic(operator):
    # Builds the generic (template) C++ overloads of `operator` for the
    # advanced C++ API: a base-case function for packs of cardinal 1 and a
    # recursive case that processes the car and recurses on the cdr.
    def get_pack(param):
        # Maps a signature letter to the pack template name.
        # NOTE(review): 'l' also maps to 'pack' (not 'packl') here — verify
        # this is intentional and not a typo.
        if param in ['v', 'vi']:
            return 'pack'
        elif param == 'l':
            return 'pack'
        else:
            return 'pack{}'.format(param[1:])

    args_list = common.enum(operator.params[1:])
    # arguments whose type carries the pack template parameters
    inter = [i for i in ['v', 'vi', 'l', 'vx1', 'vx2', 'vx3', 'vx4'] \
             if i in operator.params[1:]]
    # when no argument is a pack, the pack type must come from an explicit
    # template argument on the call site
    need_tmpl_pack = get_pack(operator.params[0]) if inter == [] else None

    # Compute parameters passed to the base C++ API functions
    def var(arg, N):
        member = 'car' if N == '1' else 'cdr'
        if arg[1] in ['vi', 'v', 'l']:
            return 'a{}.{}'.format(arg[0], member)
        elif (arg[1] in ['*', 'c*']) and N != '1':
            # pointers advance by one base-pack length in the recursive call
            return 'a{} + len_'.format(arg[0])
        else:
            return 'a{}'.format(arg[0])

    vars1 = [var(i, '1') for i in args_list] + ['T()'] + \
            (['typename ToPackType::value_type()'] if not operator.closed \
             else []) + ['SimdExt()']
    varsN = [var(i, 'N') for i in args_list]
    other_varsN = ', '.join(['a{}'.format(i[0]) for i in args_list])
    if other_varsN != '':
        other_varsN = ', ' + other_varsN
    if not operator.closed:
        varsN = ['typename ToPackType::value_type()'] + varsN
    if need_tmpl_pack != None:
        varsN = ['{}()'.format(need_tmpl_pack)] + varsN
    vars1 = ', '.join(vars1)
    varsN = ', '.join(varsN)

    # Compute return type
    ret1 = 'ToPackType' if not operator.closed \
           else common.get_one_type_generic_adv_cxx(operator.params[0],
                                                    'T', '1')
    retN = 'ToPackType' if not operator.closed \
           else common.get_one_type_generic_adv_cxx(operator.params[0],
                                                    'T', 'N')

    # Dump C++
    if operator.params[0] in ['v', 'vi', 'l']:
        return_ret = 'return ret;'
        ret_car = 'ret.car = '
        ret_cdr = 'ret.cdr = '
        post_car = ''
        post_cdr = ''
        pack1_ret = '{} ret;'.format(ret1)
        packN_ret = '{} ret;'.format(retN)
    elif operator.params[0] in ['vx1', 'vx2', 'vx3', 'vx4']:
        num = operator.params[0][-1:]
        return_ret = 'return ret;'
        # NOTE(review): the 'simd_traits::' spellings below look truncated —
        # template arguments in <...> appear to have been lost during
        # extraction of this file; confirm against the repository original.
        if operator.closed:
            ret_car = \
            'typename simd_traits::simd_vectorx{} car = '. \
            format(num)
        else:
            ret_car = \
            '''typename simd_traits::simd_vectorx{} car = '''.format(num)
        ret_cdr = 'packx{} cdr = '.format(num)
        # multi-vector packs are filled member by member via set_car/set_cdr
        post_car = '; ret.set_car({})'.format(', '.join( \
                   ['car.v{}'.format(i) for i in range(0, int(num))]))
        post_cdr = '; ret.set_cdr({})'.format(', '.join( \
                   ['cdr.v{}'.format(i) for i in range(0, int(num))]))
        pack1_ret = '{} ret;'.format(ret1)
        packN_ret = '{} ret;'.format(retN)
    else:
        # no value returned (stores, ...): nothing to assemble
        return_ret = ''
        ret_car = ''
        ret_cdr = ''
        post_car = ''
        post_cdr = ''
        pack1_ret = ''
        packN_ret = ''

    if '*' in operator.params[1:] or 'c*' in operator.params[1:]:
        # store*[au] does not contain any packx* argument, therefore the offset
        # cannot be correctly computed
        if operator.name in ['store2u', 'store2a']:
            multiplier = '2 * '
        elif operator.name in ['store3u', 'store3a']:
            multiplier = '3 * '
        elif operator.name in ['store4u', 'store4a']:
            multiplier = '4 * '
        else:
            multiplier = ''
        int_len = 'int len_ = {}len({}());'. \
                  format(multiplier, get_pack(inter[0]) if inter != [] \
                                     else need_tmpl_pack)
    else:
        int_len = ''

    sig = operator.get_generic_signature('cxx_adv')
    for k in sig:
        sig[k] = sig[k][:-1]  # remove trailing ';'

    # Double-brace placeholders: the first .format fills the operator-
    # independent parts, the second (tmpl.format below) fills sig1/sigN and
    # cxx_name for each emitted overload.
    tmpl = '''{{sig1}} {{{{{pack1_ret}
{ret_car}{name}({vars1}){post_car};
{return_ret}}}}}

{{sigN}} {{{{{packN_ret}{int_len}
{ret_car}{name}({vars1}){post_car};
{ret_cdr}{{cxx_name}}({varsN}){post_cdr};
{return_ret}}}}}'''. \
    format(pack1_ret=pack1_ret, ret_car=ret_car, name=operator.name,
           vars1=vars1, return_ret=return_ret, retN=retN,
           packN_ret=packN_ret, int_len=int_len, ret_cdr=ret_cdr,
           varsN=varsN, post_car=post_car, post_cdr=post_cdr)

    ret = ''
    if operator.cxx_operator:
        # emit operatorX overloads in addition to the named function
        ret += tmpl.format(cxx_name='operator'+operator.cxx_operator,
                           sig1=sig['op1'], sigN=sig['opN']) + '\n\n'
    ret += tmpl.format(cxx_name=operator.name, sig1=sig['1'],
                       sigN=sig['N']) + '\n\n'
    if not operator.closed:
        # dispatch overload forwarding the destination pack type
        return_ins = 'return ' if operator.params[0] != '_' else ''
        ret += '\n\n'
        ret += '''{sig} {{
  {return_ins}{cxx_name}(ToPackType(){other_varsN});
}}'''. \
        format(cxx_name=operator.name, sig=sig['dispatch'],
               other_varsN=other_varsN, return_ins=return_ins)
    if need_tmpl_pack != None:
        # dispatch overload forwarding the explicit pack template argument
        ret += '\n\n'
        ret += '''{sig} {{
  return {cxx_name}(SimdVector(){other_varsN});
}}'''. \
        format(sig=sig['dispatch'], cxx_name=operator.name,
               other_varsN=other_varsN)
    return ret

# -----------------------------------------------------------------------------
# Generate assignments operator (+=, *=, &=, ...)

def gen_assignment_operators(op):
    # Placeholder: assignment operators are not generated yet.
    #return '''{sig} {{ }}'''
    return ''

# -----------------------------------------------------------------------------
# Generate advanced C++ API

def doit(opts):
    # Entry point: writes include/cxx_adv_api_functions.hpp with the generic
    # C++ overloads for every operator that opted into the advanced API.
    common.myprint(opts, 'Generating advanced C++ API')
    filename = os.path.join(opts.include_dir, 'cxx_adv_api_functions.hpp')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as out:
        out.write('''#ifndef NSIMD_CXX_ADV_API_FUNCTIONS_HPP
#define NSIMD_CXX_ADV_API_FUNCTIONS_HPP

namespace nsimd {

''')
        for op_name, operator in operators.operators.items():
            if not operator.autogen_cxx_adv:
                continue
            out.write('''{hbar}

{code}

'''.format(hbar=common.hbar,
           code=get_cxx_advanced_generic(operator)))
            if operator.cxx_operator and \
               (operator.args in [['v', 'v'], ['v', 'p']]):
                out.write('{hbar}\n{code}'. \
                          format(hbar=common.hbar,
                                 code=gen_assignment_operators(operator)))
        out.write('''{hbar}

}} // namespace nsimd

#endif'''.format(hbar=common.hbar))
    common.clang_format(opts, filename)
import operators
import common
import gen_adv_c_api
import os
from datetime import date
import sys

# -----------------------------------------------------------------------------
# Generate code for output

def get_simd_implementation_src(operator, simd_ext, from_typ, fmtspec):
    # Emit the C/C++ wrapper for `operator` on `simd_ext`/`from_typ` when the
    # operator is backed by compiled sources (operator.src is true).
    # Three cases: 'cpu' (per-lane scalar emulation), 'f16' (compute in f32
    # via upcvt/downcvt), general (declare and forward to the Sleef symbol).
    if simd_ext == 'cpu':
        # Lane count of the CPU emulation vector: register bits / type bits.
        vlen = common.CPU_NBITS // int(from_typ[1:])
        vasi = []
        params = operator.params[1:]
        for i in range(len(params)):
            if params[i] in ['v', 'l', 'vi']:
                # SIMD argument: keep {i} escaped so the lane index is
                # substituted later by vasi.format(i=i).
                vasi.append('a{}.v{{i}}'.format(i))
            else:
                vasi.append('a{}'.format(i))
        vasi = ', '.join(vasi)
        # f16 is emulated with the f32 scalar helpers.
        typ2 = 'f32' if from_typ == 'f16' else from_typ
        if operator.params[0] == '_':
            # No return value: one scalar call per lane.
            body = '\n'.join(
                ['nsimd_scalar_{op_name}_{typ2}({vasi});'. \
                 format(op_name=operator.name, typ2=typ2,
                        vasi=vasi.format(i=i)) for i in range(vlen)])
        else:
            body = 'nsimd_cpu_v{} ret;\n'.format(from_typ)
            body += '\n'.join(
                ['ret.v{i} = nsimd_scalar_{op_name}_{typ2}({vasi});'. \
                 format(i=i, op_name=operator.name, typ2=typ2,
                        vasi=vasi.format(i=i)) for i in range(vlen)])
            body += '\nreturn ret;\n'
        return \
        '''{hbar} NSIMD_INLINE {return_typ} NSIMD_VECTORCALL nsimd_{name}_{simd_ext}_{suf}({c_args}) {{ {body} }} #if NSIMD_CXX > 0 namespace nsimd {{ NSIMD_INLINE {return_typ} NSIMD_VECTORCALL {name}({cxx_args}) {{ {body} }} }} // namespace nsimd #endif '''.format(body=body, **fmtspec)
    if from_typ == 'f16':
        # Upconvert every f16 argument to a pair of f32 vectors, call the f32
        # implementation on both halves, then downconvert.
        n = len(operator.params[1:])
        f16_to_f32 = '\n'.join(
            ['nsimd_{simd_ext}_vf32x2 buf{i}' \
             ' = nsimd_upcvt_{simd_ext}_f32_f16({args});'. \
             format(i=i, args=common.get_arg(i), **fmtspec) \
             for i in range(n)])
        bufsv0 = ', '.join(['buf{}.v0'.format(i) for i in range(n)])
        bufsv1 = ', '.join(['buf{}.v1'.format(i) for i in range(n)])
        if operator.params[0] != '_':
            retv0 = 'nsimd_{simd_ext}_vf32 retv0 = '.format(**fmtspec)
            retv1 = 'nsimd_{simd_ext}_vf32 retv1 = '.format(**fmtspec)
            f32_to_f16 = \
                'return nsimd_downcvt_{simd_ext}_f16_f32(retv0, retv1);'. \
                format(**fmtspec)
        else:
            # void operator: no result variables, no downconversion.
            retv0 = ''
            retv1 = ''
            f32_to_f16 = ''
        retv0 += '{sleef_symbol_prefix}_{simd_ext}_f32({bufsv0});'. \
                 format(bufsv0=bufsv0, **fmtspec)
        retv1 += '{sleef_symbol_prefix}_{simd_ext}_f32({bufsv1});'. \
                 format(bufsv1=bufsv1, **fmtspec)
        return \
        '''{hbar} NSIMD_INLINE {return_typ} NSIMD_VECTORCALL nsimd_{name}_{simd_ext}_{suf}({c_args}) {{ {f16_to_f32} {retv0} {retv1} {f32_to_f16}}} #if NSIMD_CXX > 0 namespace nsimd {{ NSIMD_INLINE {return_typ} NSIMD_VECTORCALL {name}({cxx_args}) {{ {f16_to_f32} {retv0} {retv1} {f32_to_f16}}} }} // namespace nsimd #endif '''.format(f16_to_f32=f16_to_f32, retv0=retv0, retv1=retv1, f32_to_f16=f32_to_f16, **fmtspec)
    else:
        # General case: extern "C" declaration of the compiled Sleef symbol
        # plus inline C and C++ wrappers forwarding to it.
        return \
        '''{hbar} #if NSIMD_CXX > 0 extern "C" {{ #endif NSIMD_DLLSPEC {return_typ} NSIMD_VECTORCALL {sleef_symbol_prefix}_{simd_ext}_{suf}({c_args}); #if NSIMD_CXX > 0 }} // extern "C" #endif NSIMD_INLINE {return_typ} NSIMD_VECTORCALL nsimd_{name}_{simd_ext}_{suf}({c_args}) {{ {returns}{sleef_symbol_prefix}_{simd_ext}_{suf}({vas}); }} #if NSIMD_CXX > 0 namespace nsimd {{ NSIMD_INLINE {return_typ} NSIMD_VECTORCALL {name}({cxx_args}) {{ {returns}{sleef_symbol_prefix}_{simd_ext}_{suf}({vas}); }} }} // namespace nsimd #endif '''.format(**fmtspec)

# -----------------------------------------------------------------------------
# Generate code for output

def get_simd_implementation(opts, operator, mod, simd_ext):
    # Build the full implementation text for one operator on one SIMD
    # extension: one C + C++ wrapper pair per (from_typ, to_typ) combination.
    typ_pairs = []
    for t in operator.types:
        return_typs = common.get_output_types(t, operator.output_to)
        for tt in return_typs:
            typ_pairs.append([t, tt])
    if not operator.closed:
        # Non-closed operators: reorder pairs so same-kind conversions
        # (float->float, int->int, unsigned->unsigned) come first, then
        # signed<->unsigned, then int<->float conversions.
        tmp = [p for p in typ_pairs if p[0] in common.ftypes and \
               p[1] in common.ftypes]
        tmp += [p for p in typ_pairs if p[0] in common.itypes and \
                p[1] in common.itypes]
        tmp += [p for p in typ_pairs if p[0] in common.utypes and \
                p[1] in common.utypes]
        tmp += [p for p in typ_pairs \
                if (p[0] in common.utypes and p[1] in common.itypes) or \
                   (p[0] in common.itypes and p[1] in common.utypes)]
        tmp += [p for p in typ_pairs \
                if (p[0] in common.iutypes and p[1] in common.ftypes) or \
                   (p[0] in common.ftypes and p[1] in common.iutypes)]
        typ_pairs = tmp
    ret = ''
    for pair in typ_pairs:
        from_typ = pair[0]
        to_typ = pair[1]
        fmtspec = operator.get_fmtspec(from_typ, to_typ, simd_ext)
        if operator.src:
            # Implementation lives in compiled sources.
            ret += get_simd_implementation_src(operator, simd_ext, from_typ,
                                               fmtspec)
        else:
            # Header-only: the platform module provides the intrinsic body.
            ret += \
            '''{hbar} NSIMD_INLINE {return_typ} NSIMD_VECTORCALL nsimd_{name}_{simd_ext}_{suf}({c_args}) {{ {content} }} #if NSIMD_CXX > 0 namespace nsimd {{ NSIMD_INLINE {return_typ} NSIMD_VECTORCALL {name}({cxx_args}) {{ {returns}nsimd_{name}_{simd_ext}_{suf}({vas}); }} }} // namespace nsimd #endif '''.format(content=mod.get_impl(opts, operator.name, simd_ext, from_typ, to_typ), **fmtspec)
    # Drop the trailing separator characters.
    return ret[0:-2]

# -----------------------------------------------------------------------------
# Generate code for output

def gen_archis_write_put(opts, platform, simd_ext, simd_dir):
    # Write put.h for one SIMD extension: declarations of the debug printing
    # helpers nsimd_put_* / nsimd_put_*_l* plus C++ put/putl wrappers.
    filename = os.path.join(simd_dir, 'put.h')
    if not common.can_create_filename(opts, filename):
        return
    op = None  # NOTE(review): unused local, kept as-is.
    with common.open_utf8(opts, filename) as out:
        # NOTE(review): several '#include' targets in the template below look
        # truncated in this extraction — verify against the repository.
        out.write( \
        '''#ifndef NSIMD_{PLATFORM}_{SIMD_EXT}_PUT_H #define NSIMD_{PLATFORM}_{SIMD_EXT}_PUT_H {include_cpu_put}#include #include {hbar} '''.format(year=date.today().year, hbar=common.hbar, simd_ext=simd_ext, platform=platform, PLATFORM=platform.upper(), SIMD_EXT=simd_ext.upper(), include_cpu_put='#include \n' \
        if simd_ext != 'cpu' else ''))
        for typ in common.types:
            out.write( \
            '''#if NSIMD_CXX > 0 extern "C" {{ #endif NSIMD_DLLSPEC int NSIMD_VECTORCALL nsimd_put_{simd_ext}_{typ}(FILE *, const char *, nsimd_{simd_ext}_v{typ}); #if NSIMD_CXX > 0 }} // extern "C" #endif #if NSIMD_CXX > 0 namespace nsimd {{ NSIMD_INLINE int NSIMD_VECTORCALL put(FILE *out, const char *fmt, nsimd_{simd_ext}_v{typ} a0, {typ}, {simd_ext}) {{ return nsimd_put_{simd_ext}_{typ}(out, fmt, a0); }} }} // namespace nsimd #endif {hbar} #if NSIMD_CXX > 0 extern "C" {{ #endif NSIMD_DLLSPEC int NSIMD_VECTORCALL nsimd_put_{simd_ext}_l{typ}(FILE *, const char *, nsimd_{simd_ext}_vl{typ}); #if NSIMD_CXX > 0 }} // extern "C" #endif #if NSIMD_CXX > 0 namespace nsimd {{ NSIMD_INLINE int NSIMD_VECTORCALL putl(FILE *out, const char *fmt, nsimd_{simd_ext}_vl{typ} a0, {typ}, {simd_ext}) {{ return nsimd_put_{simd_ext}_l{typ}(out, fmt, a0); }} }} // namespace nsimd #endif {hbar} '''.format(simd_ext=simd_ext, hbar=common.hbar, typ=typ))
        out.write('#endif')
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Generate code for architectures

def gen_archis_write_file(opts, op, platform, simd_ext, simd_dir):
    # Write <op.name>.h for one (platform, simd_ext): header guard, platform
    # includes, and the generated implementation for every type pair.
    filename = os.path.join(simd_dir, '{}.h'.format(op.name))
    if not common.can_create_filename(opts, filename):
        return
    mod = opts.platforms[platform]
    additional_include = mod.get_additional_include(op.name, platform,
                                                    simd_ext)
    if op.src:
        # NOTE(review): include targets truncated by extraction here as well.
        additional_include += \
        '''#include #include '''.format(platform=platform, simd_ext=simd_ext)
    with common.open_utf8(opts, filename) as out:
        out.write(
        '''#ifndef {guard} #define {guard} #include {additional_include} {code} {hbar} #endif '''.format(additional_include=additional_include, year=date.today().year, guard=op.get_header_guard(platform, simd_ext), platform=platform, simd_ext=simd_ext, func=op.name, hbar=common.hbar, code=get_simd_implementation(opts, op, mod, simd_ext)))
    common.clang_format(opts, filename)

def gen_archis_simd(opts, platform, simd_ext, simd_dir):
    # One header per operator plus the put.h debug helpers.
    for op_name, operator in operators.operators.items():
        gen_archis_write_file(opts, operator, platform, simd_ext, simd_dir)
    gen_archis_write_put(opts, platform, simd_ext, simd_dir)

def gen_archis_types(opts, simd_dir, platform, simd_ext):
    # Write types.h: native vector/logical typedefs, the x2/x3/x4 SoA structs
    # (native when the platform supports them, generic structs otherwise) and
    # the C++ simd_traits specializations.
    filename = os.path.join(simd_dir, 'types.h')
    if not common.can_create_filename(opts, filename):
        return
    mod = opts.platforms[platform]
    c_code = '\n'.join([mod.get_type(opts, simd_ext, t,
                                     'nsimd_{}_v{}'.format(simd_ext, t)) \
                        for t in common.types])
    c_code += '\n\n'
    c_code += '\n'.join([mod.get_logical_type(
                             opts, simd_ext, t, 'nsimd_{}_vl{}'. \
                             format(simd_ext, t)) for t in common.types])
    if mod.has_compatible_SoA_types(simd_ext):
        # The platform has native structure-of-arrays types (e.g. NEON).
        for deg in range(2, 5):
            c_code += '\n'.join([mod.get_SoA_type(simd_ext, typ, deg,
                                 'nsimd_{}_v{}x{}'.format(simd_ext, typ,
                                                          deg)) \
                                 for typ in common.types])
    else:
        # Fall back to plain structs of 2/3/4 vectors.
        c_code += '\n'.join([
        ''' typedef NSIMD_STRUCT nsimd_{simd_ext}_v{typ}x2 {{ nsimd_{simd_ext}_v{typ} v0; nsimd_{simd_ext}_v{typ} v1; }} nsimd_{simd_ext}_v{typ}x2; '''.format(simd_ext=simd_ext, typ=typ) for typ in common.types])
        c_code += '\n'.join([
        ''' typedef NSIMD_STRUCT nsimd_{simd_ext}_v{typ}x3 {{ nsimd_{simd_ext}_v{typ} v0; nsimd_{simd_ext}_v{typ} v1; nsimd_{simd_ext}_v{typ} v2; }} nsimd_{simd_ext}_v{typ}x3; '''.format(simd_ext=simd_ext, typ=typ) for typ in common.types])
        c_code += '\n'.join([
        ''' typedef NSIMD_STRUCT nsimd_{simd_ext}_v{typ}x4 {{ nsimd_{simd_ext}_v{typ} v0; nsimd_{simd_ext}_v{typ} v1; nsimd_{simd_ext}_v{typ} v2; nsimd_{simd_ext}_v{typ} v3; }} nsimd_{simd_ext}_v{typ}x4; '''.format(simd_ext=simd_ext, typ=typ) for typ in common.types])
    c_code += '\n\n'
    cxx_code = \
        '\n\n'.join(['''template <> struct simd_traits<{typ}, {simd_ext}> {{ typedef nsimd_{simd_ext}_v{typ} simd_vector; typedef nsimd_{simd_ext}_v{typ}x2 simd_vectorx2; typedef nsimd_{simd_ext}_v{typ}x3 simd_vectorx3; typedef nsimd_{simd_ext}_v{typ}x4 simd_vectorx4; typedef nsimd_{simd_ext}_vl{typ} simd_vectorl; }};'''.format(typ=t, simd_ext=simd_ext) for t in common.types])
    with common.open_utf8(opts, filename) as out:
        out.write('''#ifndef NSIMD_{platform}_{SIMD_EXT}_TYPES_H #define NSIMD_{platform}_{SIMD_EXT}_TYPES_H {c_code} #define NSIMD_{simd_ext}_NB_REGISTERS {nb_registers} #if NSIMD_CXX > 0 namespace nsimd {{ // defined in nsimd.h for C++20 concepts // struct {simd_ext} {{}}; {cxx_code} }} // namespace nsimd #endif #endif '''. \
        format(year=date.today().year, platform=platform.upper(), SIMD_EXT=simd_ext.upper(), simd_ext=simd_ext, c_code=c_code, cxx_code=cxx_code, nb_registers=mod.get_nb_registers(simd_ext)))
    common.clang_format(opts, filename)

def gen_archis_platform(opts, platform):
    # Generate types.h and all operator headers for every SIMD extension of
    # one platform that was selected on the command line.
    include_dir = os.path.join(opts.include_dir, platform);  # NOTE(review): stray semicolon, kept as-is.
    for s in opts.platforms[platform].get_simd_exts():
        common.myprint(opts, 'Found new SIMD extension: {}'.format(s))
        if s in opts.simd:
            simd_dir = os.path.join(include_dir, s)
            common.mkdir_p(simd_dir)
            gen_archis_types(opts, simd_dir, platform, s)
            gen_archis_simd(opts, platform, s, simd_dir)
        else:
            common.myprint(opts, ' Extension excluded by command line')

def doit(opts):
    # Entry point: generate the SIMD implementation headers for every
    # requested platform.
    common.myprint(opts, 'Generating SIMD implementations')
    opts.platforms = common.get_platforms(opts)
    for p in opts.platforms:
        common.mkdir_p(os.path.join(opts.include_dir, p))
        gen_archis_platform(opts, p)

================================================ FILE: egg/gen_base_apis.py ================================================
# Copyright (c) 2019 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import operators
import common
import os
from datetime import date
import sys

# -----------------------------------------------------------------------------
# C base generic implem

def get_c_base_generic(operator):
    # Emit the C macro pair (v<name> / v<name>_e) that dispatches a generic
    # call to the concrete nsimd_<name>_<simd>_<type> symbol via token
    # pasting. Non-closed operators carry both from_type and to_type in the
    # pasted name.
    vas = common.get_args(len(operator.params) - 1)
    sig = operator.get_generic_signature('c_base')
    if not operator.closed:
        return \
        '''{sig} NSIMD_PP_CAT_6(nsimd_{name}_, NSIMD_SIMD, _, \\ to_type, _, from_type)({vas}) {sig_e} NSIMD_PP_CAT_6(nsimd_{name}_, simd_ext, _, \\ to_type, _, from_type)({vas})'''. \
        format(sig=sig[0], sig_e=sig[1], name=operator.name, vas=vas)
    else:
        return \
        '''{sig} NSIMD_PP_CAT_4(nsimd_{name}_, NSIMD_SIMD, _, type)({vas}) {sig_e} NSIMD_PP_CAT_4(nsimd_{name}_, simd_ext, _, type)({vas})'''. \
        format(sig=sig[0], sig_e=sig[1], name=operator.name, vas=vas)

# -----------------------------------------------------------------------------
# C++ base generic implem

def get_cxx_base_generic(operator):
    # Emit the C++ wrapper that forwards to the tag-dispatched overload,
    # appending NSIMD_SIMD() (and F()/T() type tags) to the argument list.
    returns = '' if operator.params[0] == '_' else 'return'
    temp = common.get_args(len(operator.params) - 1)
    temp += ', ' if temp != '' else ''
    # Non-closed operators need both the from-type tag F() and to-type tag T().
    args = temp + 'F(), T()' if not operator.closed else temp + 'T()'
    return \
    '''#if NSIMD_CXX > 0 namespace nsimd {{ {sig} {{ {returns} {name}({args}, NSIMD_SIMD()); }} }} // namespace nsimd #endif'''.format(name=operator.name, args=args, returns=returns, sig=operator.get_generic_signature('cxx_base')[:-1])

# -----------------------------------------------------------------------------
# Declarations for output

def get_put_decl():
    # Declarations of the vput/vput_e debug macros and the generic C++ put().
    # NOTE(review): the '#include' target in this literal looks truncated in
    # this extraction — verify against the repository.
    return \
    '''#include NSIMD_AUTO_INCLUDE(put.h) #define vput(out, fmt, a0, type) \ NSIMD_PP_CAT_4(nsimd_put_, NSIMD_SIMD, _, type)(out, fmt, a0) #define vput_e(out, fmt, a0, type, simd_ext) \ NSIMD_PP_CAT_4(nsimd_put_, simd_ext, _, type)(out, fmt, a0) #if NSIMD_CXX > 0 namespace nsimd { template int put(FILE *out, const char *fmt, A0 a0, T) { return put(out, fmt, a0, T(), NSIMD_SIMD()); } } // namespace nsimd #endif '''

# -----------------------------------------------------------------------------
# Generate base APIs

def doit(opts):
    # Write include/nsimd/functions.h: for every operator, the auto-include
    # of its per-architecture header plus its C and C++ generic dispatchers.
    common.myprint(opts, 'Generating base APIs')
    common.mkdir_p(opts.include_dir)
    filename = os.path.join(opts.include_dir, 'functions.h')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as out:
        out.write('''#ifndef NSIMD_FUNCTIONS_H #define NSIMD_FUNCTIONS_H '''.format(year=date.today().year))
        for op_name, operator in operators.operators.items():
            out.write('''{} #include NSIMD_AUTO_INCLUDE({}.h) {} {} '''.format(common.hbar, operator.name,
                   get_c_base_generic(operator),
                   get_cxx_base_generic(operator)))
        out.write('''{hbar} {put_decl} {hbar} #endif'''. \
                  format(hbar=common.hbar, put_decl=get_put_decl()))
    common.clang_format(opts, filename)

================================================ FILE: egg/gen_benches.py ================================================
# Copyright (c) 2019 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
import sys
import common
import operators
from datetime import date
from collections import OrderedDict

# -----------------------------------------------------------------------------
# Sig

def sig_replace_name(sig, name):
    # A signature is a space-separated string: '<ret> <name> <params...>'.
    # Replace the function-name word (index 1).
    sig = sig.split(' ')
    sig[1] = name
    return ' '.join(sig)

def sig_translate(sig, translates, name=None):
    # Map return type and parameter types of `sig` through the `translates`
    # dict (identity for types not in the dict); optionally rename.
    sig = sig.split(' ')
    ## Translates a given type to another
    sig[0] = translates.get(sig[0], sig[0])
    ## Do not use sig[1] (the function name)
    for i, p in enumerate(sig[2:]):
        sig[2 + i] = translates.get(p, p)
    sig = ' '.join(sig)
    ## Redefine name if available
    if name:
        sig = sig_replace_name(sig, name)
    return sig

# -----------------------------------------------------------------------------
# Errors

class BenchError(RuntimeError):
    pass

# -----------------------------------------------------------------------------
# Markers

def asm_marker(simd, bench_name):
    # Emit an inline-asm call to a dummy symbol so benchmark sections can be
    # located in disassembly; guarded by #ifdef ASM_MARKER.
    r = ''
    r += '#ifdef ASM_MARKER'
    r += '\n'
    for_intel = '__asm__ __volatile__("callq __asm_marker__{bench_name}");'. \
                format(bench_name=bench_name)
    for_arm = '__asm__ __volatile__("bl __asm_marker__{bench_name}");'. \
              format(bench_name=bench_name)
    if simd in common.x86_simds:
        r += for_intel
    elif simd in common.arm_simds:
        r += for_arm
    elif simd == 'cpu':
        # 'cpu' builds can target either architecture: decide at compile time.
        r += '''#if defined(NSIMD_X86) {} #elif defined(NSIMD_ARM) {} #endif'''.format(for_intel, for_arm)
    elif simd in common.ppc_simds:
        #TODO
        # No marker on PPC yet: return an empty string.
        return ''.format(bench_name=bench_name)
    else:
        raise BenchError('Unable to write marker for SIMD: {}'.format(simd))
    r += '\n'
    r += '#endif'
    return r

# -----------------------------------------------------------------------------
# Metaclass
# Provides __static_init__ hook

class StaticInitMetaClass(type):
    # Calls the class-level __static_init__ hook right after class creation,
    # which lets every Type* subclass register itself in `types`.
    def __new__(cls, name, bases, dct):
        x = type.__new__(cls, name, bases, dct)
        x.__static_init__(x)
        return x

# -----------------------------------------------------------------------------
# Basic nsimd types

## Will be automatically populated thanks to the metaclass
types = {}

# -----------------------------------------------------------------------------

class TypeBase(object, metaclass=StaticInitMetaClass):
    # Root of the parameter-type hierarchy; concrete subclasses register a
    # singleton in `types` keyed by their `name` attribute.
    @staticmethod
    def __static_init__(c):
        ## Skip base class
        if c.__name__.endswith('Base'):
            return
        types[c.name] = c()
    def is_simd(self):
        return False
    def is_volatile(self):
        return False

class TypeVectorBase(TypeBase):
    def is_simd(self):
        return True

# -----------------------------------------------------------------------------

class TypeVoid(TypeBase):
    name = '_'
    def as_type(self, typ):
        return 'void'

# -----------------------------------------------------------------------------

class TypeScalar(TypeBase):
    name = 's'
    def as_type(self, typ):
        return typ
    def code_load(self, simd, typ, ptr):
        return '*({})'.format(ptr)
    def code_store(self, simd, typ, lhs, rhs):
        return '*({}) = {}'.format(lhs, rhs)

# -----------------------------------------------------------------------------

class TypeVolatileScalar(TypeScalar):
    name = 'volatile-s'
    def is_volatile(self):
        return True

# -----------------------------------------------------------------------------

class TypeLogicalScalar(TypeBase):
    name = 'ls'
    def as_type(self, typ):
        # Logical scalars are represented by the unsigned type of same width.
        return { 'i8': 'u8', 'i16': 'u16', 'i32': 'u32', 'i64': 'u64',
                 'f32': 'u32', 'f64': 'u64', }.get(typ, typ)
    def code_load(self, simd, typ, ptr):
        return '({})(*({}))'.format(self.as_type(typ), ptr)
    def code_store(self, simd, typ, lhs, rhs):
        return '*({}) = ({})({})'.format(lhs, typ, rhs)

# -----------------------------------------------------------------------------

class TypeVolatileLogicalScalar(TypeLogicalScalar):
    name = 'volatile-ls'
    def is_volatile(self):
        return True

# -----------------------------------------------------------------------------

class TypeInt(TypeScalar):
    name = 'p'
    def as_type(self, typ):
        return 'int'

# -----------------------------------------------------------------------------

class TypePtr(TypeBase):
    name = '*'
    def as_type(self, typ):
        return typ + '*'

# -----------------------------------------------------------------------------

class TypeConstPtr(TypeBase):
    name = 'c*'
    def as_type(self, typ):
        return 'const ' + typ + '*'

# -----------------------------------------------------------------------------

class TypeVector(TypeVectorBase):
    name = 'v'
    def as_type(self, typ):
        return 'v' + typ
    def code_load(self, simd, typ, ptr):
        return 'nsimd::loada({}, {}())'.format(ptr, typ)
    def code_store(self, simd, typ, ptr, expr):
        return 'nsimd::storea({}, {}, {}())'.format(ptr, expr, typ)

# -----------------------------------------------------------------------------

class TypeCPUVector(TypeVector):
    name = 'vcpu'
    def code_load(self, simd, typ, ptr):
        return 'nsimd::loada({}, {}(), nsimd::cpu())'.format(ptr, typ)
    def code_store(self, simd, typ, ptr, expr):
        return 'nsimd::storea({}, {}, {}(), nsimd::cpu())'.format(ptr, expr,
                                                                  typ)

# -----------------------------------------------------------------------------

class TypeUnrolledVectorBase(TypeVectorBase):
    def as_type(self, typ):
        # FIX: was `raise NotImplemented()` — NotImplemented is a singleton,
        # not an exception, and calling it raises TypeError. Subclasses do not
        # override as_type; reaching this is a programming error.
        raise NotImplementedError()
    def code_load(self, simd, typ, ptr):
        # NOTE(review): the template literal below appears truncated by the
        # extraction (lost '<...>' text) — verify against the repository.
        return 'nsimd::loada>({})'. \
               format(typ, self.unroll, ptr)
    def code_store(self, simd, typ, ptr, expr):
        return 'nsimd::storea({}, {})'.format(ptr, expr)

# -----------------------------------------------------------------------------

class TypeUnrolledVector1(TypeUnrolledVectorBase):
    name = 'vu1'
    unroll = 1

class TypeUnrolledVector2(TypeUnrolledVectorBase):
    name = 'vu2'
    unroll = 2

class TypeUnrolledVector3(TypeUnrolledVectorBase):
    name = 'vu3'
    unroll = 3

class TypeUnrolledVector4(TypeUnrolledVectorBase):
    name = 'vu4'
    unroll = 4

class TypeUnrolledVector5(TypeUnrolledVectorBase):
    name = 'vu5'
    unroll = 5

class TypeUnrolledVector6(TypeUnrolledVectorBase):
    name = 'vu6'
    unroll = 6

class TypeUnrolledVector7(TypeUnrolledVectorBase):
    name = 'vu7'
    unroll = 7

class TypeUnrolledVector8(TypeUnrolledVectorBase):
    name = 'vu8'
    unroll = 8

class TypeUnrolledVector9(TypeUnrolledVectorBase):
    name = 'vu9'
    unroll = 9

# -----------------------------------------------------------------------------

class TypeVectorX2(TypeVectorBase):
    name = 'vx2'
    def as_type(self, typ):
        return 'v' + typ + 'x2'

# -----------------------------------------------------------------------------

class TypeVectorX3(TypeVectorBase):
    name = 'vx3'
    def as_type(self, typ):
        return 'v' + typ + 'x3'

# -----------------------------------------------------------------------------

class TypeVectorX4(TypeVectorBase):
    name = 'vx4'
    def as_type(self, typ):
        return 'v' + typ + 'x4'

# -----------------------------------------------------------------------------

class TypeLogical(TypeVectorBase):
    name = 'l'
    def as_type(self, typ):
        return 'vl' + typ
    def code_load(self, simd, typ, ptr):
        return 'nsimd::loadla({}, {}())'.format(ptr, typ)
    def code_store(self, simd, typ, ptr, expr):
        return 'nsimd::storela({}, {}, {}())'.format(ptr, expr, typ)

# -----------------------------------------------------------------------------

class TypeCPULogical(TypeLogical):
    name = 'lcpu'
    def code_load(self, simd, typ, ptr):
        return 'nsimd::loadla({}, {}(), nsimd::cpu())'.format(ptr, typ)
    def code_store(self, simd, typ, ptr, expr):
        return 'nsimd::storela({}, {}, {}(), nsimd::cpu())'.format(ptr, expr,
                                                                   typ)

# -----------------------------------------------------------------------------

class TypeUnrolledLogicalBase(TypeVectorBase):
    def as_type(self, typ):
        # FIX: was `raise NotImplemented()` — see TypeUnrolledVectorBase.
        raise NotImplementedError()
    def code_load(self, simd, typ, ptr):
        # NOTE(review): template literal appears truncated by the extraction.
        return 'nsimd::loadla>({})'. \
               format(typ, self.unroll, ptr)
    def code_store(self, simd, typ, ptr, expr):
        return 'nsimd::storela({}, {})'.format(ptr, expr)

# -----------------------------------------------------------------------------

class TypeUnrolledLogical1(TypeUnrolledLogicalBase):
    name = 'lu1'
    unroll = 1

class TypeUnrolledLogical2(TypeUnrolledLogicalBase):
    name = 'lu2'
    unroll = 2

class TypeUnrolledLogical3(TypeUnrolledLogicalBase):
    name = 'lu3'
    unroll = 3

class TypeUnrolledLogical4(TypeUnrolledLogicalBase):
    name = 'lu4'
    unroll = 4

class TypeUnrolledLogical5(TypeUnrolledLogicalBase):
    name = 'lu5'
    unroll = 5

class TypeUnrolledLogical6(TypeUnrolledLogicalBase):
    name = 'lu6'
    unroll = 6

class TypeUnrolledLogical7(TypeUnrolledLogicalBase):
    name = 'lu7'
    unroll = 7

class TypeUnrolledLogical8(TypeUnrolledLogicalBase):
    name = 'lu8'
    unroll = 8

class TypeUnrolledLogical9(TypeUnrolledLogicalBase):
    name = 'lu9'
    unroll = 9

# -----------------------------------------------------------------------------

class TypeBoostSimdVector(TypeVectorBase):
    name = 'boost::simd::pack'
    def as_type(self, typ):
        return 'boost::simd::pack<{}>'.format(typ)
    def code_load(self, simd, typ, ptr):
        return '{}({})'.format(self.as_type(typ), ptr)
    def code_store(self, simd, typ, ptr, expr):
        return 'nsimd::storea({}, {}, {}())'.format(ptr, expr, typ)

# -----------------------------------------------------------------------------

class TypeBoostSimdLogicalVector(TypeVectorBase):
    name = 'boost::simd::lpack'
    def as_type(self, typ):
        # NOTE(review): literal appears truncated by the extraction (no
        # placeholder for `typ`) — verify against the repository.
        return 'boost::simd::pack>'.format(typ)
    def code_load(self, simd, typ, ptr):
        return '{}({})'.format(self.as_type(typ), ptr)
    def code_store(self, simd, typ, ptr, expr):
        return 'nsimd::storea({}, {}, {}())'.format(ptr, expr, typ)

# -----------------------------------------------------------------------------

class TypeMIPPReg(TypeVectorBase):
    name = 'mipp::reg'
    def as_type(self, typ):
        return 'mipp::Reg<{}>'.format(typ)
    def code_load(self, simd, typ, ptr):
        return 'mipp::load<{}>({})'.format(typ, ptr)
    def code_store(self, simd, typ, ptr, expr):
        return 'mipp::store({}, {})'.format(ptr, expr)

# -----------------------------------------------------------------------------

class TypeMIPPMsk(TypeVectorBase):
    name = 'mipp::msk'
    def as_type(self, typ):
        return 'mipp::Msk<{}>'.format(typ)
    def code_load(self, simd, typ, ptr):
        # AVX-512 masks are plain integers: load/store them directly.
        if simd in ['avx512_knl', 'avx512_skylake']:
            return '*({})'.format(ptr)
        else:
            return 'mipp::load<{}>({})'.format(typ, ptr)
    def code_store(self, simd, typ, ptr, expr):
        if simd in ['avx512_knl', 'avx512_skylake']:
            return '*({}) = {}'.format(ptr, expr)
        else:
            return 'mipp::store({}, reinterpret_cast({}))'.format(ptr, expr)

# -----------------------------------------------------------------------------

def type_of(param):
    # Look up the Type* singleton registered for a signature parameter code.
    if param in types:
        return types[param]
    else:
        raise BenchError("Unable to find corresponding type for: " + param)

def as_type(param, typ):
    return type_of(param).as_type(typ)

# -----------------------------------------------------------------------------
# Operator class needs to be reinforced for benches

class BenchOperator(object, metaclass=type):
    # Mixin layered over operators.Operator adding everything benchmark
    # generation needs (includes, signatures of competing libraries, codegen).
    def __init__(self):
        self.typed_params_ = []
        for p in self.params:
            self.typed_params_.append(type_of(p))
    @property
    def function_name(self):
        # Strip namespaces and template arguments: 'a::b<T>' -> 'b'.
        return self.name.split('::')[-1].split('<')[0]
    ## Generates list of includes to be included
    def gen_includes(self, lang):
        # NOTE(review): the include strings below appear emptied by the
        # extraction (lost '<...>' text) — verify against the repository.
        includes = []
        includes.append('')
        if lang == 'cxx_adv':
            includes.append('')
        if lang == 'c_base':
            includes += ['', '', '', '']
        else:
            includes += ['', '', '', '', '']
        return includes
    def match_sig(self, signature):
        # True when `signature` has exactly the same parameter codes.
        (name, params) = common.parse_signature(signature)
        if len(params) != len(self.params):
            return False
        for p1, p2 in zip(params, self.params):
            if p1 != p2:
                return False
        return True
    def bench_code_before(self, typ):
        return ''
    def bench_against_init(self):
        # Empty bench table: bench[simd][typ][category] with '*' wildcards.
        bench = {}
        for simd in ['*'] + common.simds:
            bench[simd] = OrderedDict()
            for typ in ['*'] + common.types:
                bench[simd][typ] = OrderedDict()
        return bench
    def bench_against_cpu(self):
        bench = self.bench_against_init()
        ## Enable bench against nsimd (cpu architecture)
        if self.bench_auto_against_cpu:
            bench['*']['*'][common.nsimd_category('cpu')] = \
                cpu_fun_from_sig(sig_translate(self.signature, {
                    's': 'volatile-s',
                    'v': 'vcpu',
                    'l': 'lcpu',
                }))
        return bench
    def bench_against_libs(self):
        bench = self.bench_against_init()
        ## Enable bench against all other libraries
        if self.bench_auto_against_mipp:
            for typ in self.bench_mipp_types():
                ## MIPP always requires template
                mipp_name = self.bench_mipp_name(typ)
                signature = sig_translate(self.signature, {
                    'v': 'mipp::reg',
                    'l': 'mipp::msk',
                }, name=mipp_name)
                if signature:
                    bench['*'][typ]['MIPP'] = signature
        if self.bench_auto_against_sleef:
            for simd in common.simds:
                for typ in self.bench_sleef_types():
                    if not common.sleef_support_type(simd, typ):
                        continue
                    sleef_name = self.bench_sleef_name(simd, typ)
                    if sleef_name is None:
                        continue
                    ## IMPORTANT:
                    ## If simd is cpu, then make the signature using scalar
                    if simd == 'cpu':
                        signature = sig_translate(self.signature, {
                            's': 'volatile-s',
                            'v': 'volatile-s',
                            'l': 'volatile-s',
                        }, sleef_name)
                    else:
                        signature = sig_translate(self.signature, {},
                                                  sleef_name)
                    if signature:
                        bench[simd][typ]['Sleef'] = signature
        if self.bench_auto_against_std:
            for simd in common.simds:
                for typ in self.bench_std_types():
                    std_name = self.bench_std_name(simd, typ)
                    # std:: versions are always scalar.
                    signature = sig_translate(self.signature, {
                        's': 'volatile-s',
                        'v': 'volatile-s',
                        'l': 'volatile-s',
                    }, std_name)
                    if signature:
                        if self.cxx_operator:
                            bench[simd][typ]['std'] = \
                                std_operator_from_sig(signature,
                                                      self.cxx_operator)
                        else:
                            bench[simd][typ]['std'] = \
                                std_fun_from_sig(signature)
        return bench
    def code_call(self, typ, args):
        return 'nsimd::{}({}, {}())'.format(self.name,
                                            common.pprint_commas(args), typ)
    def code_ptr_step(self, typ, simd):
        # SIMD operators advance by a full vector; scalar ones by one element.
        if any(p.is_simd() for p in self.typed_params_):
            return 'vlen_e({}, {})'.format(typ, simd)
        else:
            return '1'

class BenchOperatorWithNoMakers(BenchOperator):
    use_for_parsing = False
    # Classes that inherit from me do not have their name member
    # which is mandatory so I fill it for them here.
    def __init__(self):
        BenchOperator.__init__(self)
        (self.name, void) = common.parse_signature(self.signature)

# -----------------------------------------------------------------------------
# Make the list of all operators, they will inherit from the corresponding
# operators.Operator and then from BenchOperator

functions = {}

class dummy(operators.MAddToOperators):
    # Metaclass used to mint bench classes without re-registering them in
    # operators.operators.
    def __new__(cls, name, bases, dct):
        return type.__new__(cls, name, bases, dct)

for op_name, operator in operators.operators.items():
    if operator.load_store:
        # We do not bench loads/stores
        continue
    op_class = dummy(operator.__class__.__name__,
                     (operator.__class__, BenchOperator), {})
    functions[op_name] = op_class()

# -----------------------------------------------------------------------------
# Function helpers

def nsimd_unrolled_fun_from_sig(from_sig, unroll):
    # Build a bench operator calling the nsimd advanced API with unroll N.
    sig = sig_translate(from_sig, {
        'v': 'vu' + str(unroll),
        'l': 'lu' + str(unroll),
    })
    class InlineNSIMDUnrolledFun(operators.Operator,
                                 BenchOperatorWithNoMakers, metaclass=dummy):
        signature = sig
        def code_call(self, typ, args):
            return 'nsimd::{}({})'.format(self.name,
                                          common.pprint_commas(args))
        def code_ptr_step(self, typ, simd):
            return 'nsimd::len(nsimd::pack<{}, {}, nsimd::{}>())'.format(
                typ, unroll, simd)
    return InlineNSIMDUnrolledFun()

def fun_from_sig(from_sig):
    # Build a bench operator that calls a plain free function.
    class InlineFun(operators.Operator, BenchOperatorWithNoMakers,
                    metaclass=dummy):
        signature = from_sig
        def code_call(self, typ, args):
            return '{}({})'.format(self.name, common.pprint_commas(args))
    return InlineFun()
def std_fun_from_sig(from_sig): return fun_from_sig(from_sig) def std_operator_from_sig(from_sig, op): class InlineStdOperatorFun(operators.Operator, BenchOperatorWithNoMakers, metaclass=dummy): __metaclass__ = dummy signature = from_sig operator = op def code_call(self, typ, args): if len(args) == 1: return '{}({})'.format(self.operator, args[0]) elif len(args) == 2: return '{} {} {}'.format(args[0], self.operator, args[1]) else: raise BenchError('std:: operators requires 1 or 2 arguments!') return InlineStdOperatorFun() def cpu_fun_from_sig(from_sig): class InlineCPUFun(operators.Operator, BenchOperatorWithNoMakers, metaclass=dummy): signature = from_sig def code_call(self, typ, args): return 'nsimd::{}({}, {}(), nsimd::cpu())'. \ format(self.name, common.pprint_commas(args), typ) return InlineCPUFun() def sanitize_fun_name(name): return ''.join(map(lambda c: c if c.isalnum() else '_', name)) # ----------------------------------------------------------------------------- # Code def code_cast(typ, expr): return '({})({})'.format(typ, expr) def code_cast_ptr(typ, expr): return code_cast(typ + '*', expr) # ----------------------------------------------------------------------------- # Globals _opts = None _lang = 'cxx_adv' # ----------------------------------------------------------------------------- # Generates def TODO(f): if _opts.verbose: common.myprint(opts, '@@ TODO: ' + f.name) def gen_filename(f, simd, typ): ## Retrieve directory from global options benches_dir = common.mkdir_p(os.path.join(_opts.benches_dir, _lang)) ## Generate path (composed from: function name + type + extension) return os.path.join(benches_dir, '{}.{}.{}.{}'.format( f.name, simd, typ, common.ext_from_lang(_lang))) def gen_bench_name(category, name, unroll=None): bench_name = '{}_{}'.format(category, name) if unroll: bench_name += '_unroll{}'.format(unroll) return bench_name def gen_bench_from_code(f, typ, code, bench_with_timestamp): header = '' header += 
common.pprint_includes(f.gen_includes(_lang)) header += \ ''' // Required for random generation #include "../benches.hpp" // Google benchmark #ifndef DISABLE_GOOGLE_BENCHMARK #include #endif #include double timestamp_ns() { timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return double(ts.tv_sec) * 1000000000.0 + double(ts.tv_nsec); } // std #include // #include #include // #include // Sleef #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wignored-qualifiers" #include #pragma GCC diagnostic pop // MIPP #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" #pragma GCC diagnostic ignored "-Wsign-conversion" #pragma GCC diagnostic ignored "-Wdouble-promotion" #pragma GCC diagnostic ignored "-Wunused-parameter" #if defined(__clang__) #pragma GCC diagnostic ignored "-Wzero-length-array" #endif #include #pragma GCC diagnostic pop ''' return \ '''{header} // ------------------------------------------------------------------------- static const int sz = 1024; template static {type}* make_data(int sz, Random r) {{ {type}* data = ({type}*)nsimd_aligned_alloc(sz * {sizeof}); for (int i = 0; i < sz; ++i) {{ data[i] = r(); }} return data; }} static {type}* make_data(int sz) {{ {type}* data = ({type}*)nsimd_aligned_alloc(sz * {sizeof}); for (int i = 0; i < sz; ++i) {{ data[i] = {type}(0); }} return data; }} {random_code} {code} int main(int argc, char** argv) {{ std::vector args(argv, argv + argc); if (std::find(args.begin(), args.end(), "--use_timestamp_ns") != args.end()) {{ {bench_with_timestamp} }} #ifndef DISABLE_GOOGLE_BENCHMARK else {{ ::benchmark::Initialize(&argc, argv); ::benchmark::RunSpecifiedBenchmarks(); }} #endif return 0; }} '''.format( name=f.name, type=typ, year=date.today().year, random_code=f.domain.code('rand_param', typ), code=code, bench_with_timestamp=bench_with_timestamp, sizeof=common.sizeof(typ), header=header, ) def gen_bench_info_from(f, simd, typ): bench_args_init = [] bench_args_decl = [] bench_args_call = [] ## 
Generate code for parameters for i, arg in enumerate(f.args): p = type_of(arg) qualifiers = '' if p.is_volatile(): qualifiers += 'volatile ' bench_args_init.append('make_data(sz, &rand_param{n})'.format(n=i)) bench_args_decl.append('{} {}* _{}'.format(qualifiers, typ, i)) bench_args_call.append(p.code_load(simd, typ, '_{} + i'.format(i))) ## Generate code for bench (using function return type) r = type_of(f.get_return()) bench_call = r.code_store(simd, typ, '_r + i', f.code_call(typ, bench_args_call)) return bench_args_init, bench_args_decl, bench_args_call, bench_call def gen_bench_asm_function(f, simd, typ, category): bench_args_init, bench_args_decl, \ bench_args_call, bench_call = gen_bench_info_from(f, simd, typ) ## Add function that can easily be parsed to get assembly and plain code return \ ''' void {bench_name}__asm__({type}* _r, {bench_args_decl}, int sz) {{ __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); // code:{{ int n = {step}; #if defined(NSIMD_IS_GCC) #pragma GCC unroll 1 #elif defined(NSIMD_IS_CLANG) #pragma clang loop unroll(disable) #elif defined(NSIMD_IS_ICC) #pragma unroll(1) #endif for (int i = 0; i < sz; i += n) {{ {bench_call}; }} // code:}} __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); }} '''.format( bench_name=gen_bench_name(category, f.function_name), type=typ, step=f.code_ptr_step(typ, simd), bench_call=bench_call, bench_args_decl=common.pprint_commas(bench_args_decl) ) def gen_bench_from_basic_fun(f, simd, typ, category, unroll=None): bench_args_init, bench_args_decl, bench_args_call, bench_call = \ gen_bench_info_from(f, simd, typ) bench_name = gen_bench_name(category, f.function_name, unroll) code_timestamp_ns = \ ''' void {bench_name}({type}* _r, {bench_args_decl}, int sz) {{ // 
Normalize size depending on the step so that we're not going out of boundaies // (Required when the size is'nt a multiple of `n`, like for unrolling benches) sz = (sz / {step}) * {step}; std::cout << "{bench_name}({type}), sz = " << sz << std::endl; {asm_marker} // code: {bench_name} int n = {step}; #if defined(NSIMD_IS_GCC) #pragma GCC unroll 1 #elif defined(NSIMD_IS_CLANG) #pragma clang loop unroll(disable) #elif defined(NSIMD_IS_ICC) #pragma unroll(1) #endif for (int i = 0; i < sz; i += n) {{ {bench_call}; }} // code: {bench_name} {asm_marker} }} ''' return \ ''' // ----------------------------------------------------------------------------- {code_before} extern "C" {{ void __asm_marker__{bench_name}() {{}} }} #ifndef DISABLE_GOOGLE_BENCHMARK void {bench_name}(benchmark::State& state, {type}* _r, {bench_args_decl}, int sz) {{ // Normalize size depending on the step so that we're not going out of boundaies // (Required when the size is'nt a multiple of `n`, like for unrolling benches) sz = (sz / {step}) * {step}; try {{ for (auto _ : state) {{ {asm_marker} // code: {bench_name} int n = {step}; #if defined(NSIMD_IS_GCC) #pragma GCC unroll 1 #elif defined(NSIMD_IS_CLANG) #pragma clang loop unroll(disable) #elif defined(NSIMD_IS_ICC) #pragma unroll(1) #endif for (int i = 0; i < sz; i += n) {{ {bench_call}; }} // code: {bench_name} {asm_marker} }} }} catch (std::exception const& e) {{ std::string message("ERROR: "); message += e.what(); state.SkipWithError(message.c_str()); }} }} BENCHMARK_CAPTURE({bench_name}, {type}, make_data(sz), {bench_args_init}, sz); #endif '''.format( bench_name=bench_name, type=typ, step=f.code_ptr_step(typ, simd), bench_call=bench_call, bench_args_init=common.pprint_commas(bench_args_init), bench_args_decl=common.pprint_commas(bench_args_decl), bench_args_call=common.pprint_commas(bench_args_call), code_before=f.bench_code_before(typ), asm_marker=asm_marker(simd, bench_name) ) def gen_code(f, simd, typ, category): code = None if 
f.returns_any_type: return TODO(f) ## TODO: We have to refactor this, it's annoying to add every possible signatures... if f.match_sig('v * v v') or f.match_sig('v * v v v') \ or f.match_sig('l * v v') or f.match_sig('l * l l') \ or f.match_sig('l * l') or f.match_sig('v * v') \ or f.match_sig('s * s') \ or f.match_sig('s * s s') \ or f.match_sig('s * s s s') \ or f.match_sig('vcpu * vcpu') \ or f.match_sig('vcpu * vcpu vcpu') \ or f.match_sig('vcpu * vcpu vcpu vcpu') \ or f.match_sig('lcpu * lcpu') \ or f.match_sig('lcpu * lcpu lcpu') \ or f.match_sig('lcpu * vcpu vcpu') \ or f.match_sig('vcpu * lcpu vcpu vcpu') \ or f.match_sig('volatile-s * volatile-s') \ or f.match_sig('volatile-s * volatile-s volatile-s') \ or f.match_sig('volatile-s * volatile-s volatile-s volatile-s') \ or f.match_sig('volatile-ls * volatile-s') \ or f.match_sig('volatile-ls * volatile-s volatile-s') \ or f.match_sig('volatile-ls * volatile-ls') \ or f.match_sig('volatile-ls * volatile-ls volatile-ls') \ or f.match_sig('volatile-s * volatile-ls volatile-s volatile-s') \ or f.match_sig('boost::simd::pack * boost::simd::pack') \ or f.match_sig('boost::simd::pack * boost::simd::pack boost::simd::pack') \ or f.match_sig('boost::simd::pack * boost::simd::pack boost::simd::pack boost::simd::pack') \ or f.match_sig('boost::simd::lpack * boost::simd::pack') \ or f.match_sig('boost::simd::lpack * boost::simd::pack boost::simd::pack') \ or f.match_sig('mipp::reg * mipp::reg') \ or f.match_sig('mipp::reg * mipp::reg mipp::reg') \ or f.match_sig('mipp::msk * mipp::reg') \ or f.match_sig('mipp::msk * mipp::reg mipp::reg') \ or f.match_sig('v * l v v'): code = gen_bench_from_basic_fun(f, simd, typ, category=category) if f.match_sig('p * l'): return TODO(f) if f.match_sig('l * p'): return TODO(f) if f.match_sig('v * s'): return TODO(f) if f.match_sig('l * p'): return TODO(f) if f.match_sig('p *'): return TODO(f) if f.match_sig('v * v p'): return TODO(f) if code is None: raise BenchError('Unable to generate 
bench for signature: ' + \ f.signature) return code def gen_bench_unrolls(f, simd, typ, category): code = '' sig = f.signature for unroll in [2, 3, 4]: f = nsimd_unrolled_fun_from_sig(sig, unroll) code += gen_bench_from_basic_fun(f, simd, typ, category=category, unroll=unroll) return code def gen_bench_against(f, simd, typ, against): code = '' # "against" dict looks like: { simd: { type: { name: sig } } } for s in [simd, '*']: if not s in against: continue for t in [typ, '*']: if not t in against[s]: continue for category, f in against[s][t].items(): # Allow function to be simple str (you use this most of the # time) if isinstance(f, str): f = fun_from_sig(f) # Now that we have a `Fun` type, we can generate code code += gen_code(f, simd, typ, category=category) return code def gen_bench_with_timestamp(f, simd, typ, category, unroll=None): code = '' bench_args_init, bench_args_decl, bench_args_call, bench_call = \ gen_bench_info_from(f, simd, typ) bench_name = gen_bench_name(category, f.function_name, unroll) bench_args_decl = '' bench_args_call = '' for i, arg in enumerate(f.args): bench_args_decl += typ + ' * data' + str(i) + ' = make_data(sz, &rand_param' + str(i) + ');' + '\n' if i != 0: bench_args_call += ', ' bench_args_call += 'data' + str(i) code += \ ''' {{ // Bench {typ} * r = make_data(sz); {bench_args_decl} double elapsed_times_ns[nb_runs] = {{ }}; // Must be at least 10000 {typ} sum = {{ }}; for (size_t run = 0; run < nb_runs; ++run) {{ double const t0 = timestamp_ns(); {bench_name}(r, {bench_args_call}, 1000); double const t1 = timestamp_ns(); elapsed_times_ns[run] = (t1 - t0) / double(sz); // Compute sum if (rand() % 2) {{ sum += std::accumulate(r, r + sz, {typ}()); }} else {{ sum -= std::accumulate(r, r + sz, {typ}()); }} }} // Save sum and elapsed time std::sort(elapsed_times_ns, elapsed_times_ns + nb_runs); size_t const i_start = nb_runs / 2 - 10; size_t const i_end = nb_runs / 2 + 10; sums["{bench_name}"] = std::make_pair(sum, 
std::accumulate(elapsed_times_ns + i_start, elapsed_times_ns + i_end, 0.0) / double(i_end - i_start)); // Number of elapsed times std::map nb_per_elapsed_time; for (size_t run = 0; run < nb_runs; ++run) {{ ++nb_per_elapsed_time[(i64(elapsed_times_ns[run] * 100)) / 100.0]; }} // Draw gnuplot std::system("mkdir -p gnuplot"); std::string const dat_filename = "gnuplot/benches.cxx_adv.{bench_name}.dat"; std::ofstream dat_file(dat_filename); for (auto const & elapsed_time_nb : nb_per_elapsed_time) {{ dat_file << elapsed_time_nb.first << " " << elapsed_time_nb.second << "\\n"; }} std::string const gnuplot_filename = "gnuplot/benches.cxx_adv.{bench_name}.gnuplot"; std::ofstream gnuplot_file(gnuplot_filename); gnuplot_file << "set term svg" << "\\n"; gnuplot_file << "set output \\"benches.cxx_adv.{bench_name}.svg\\"" << "\\n"; gnuplot_file << "set xlabel \\"Time in nanoseconds (lower is better)\\"" << "\\n"; gnuplot_file << "set ylabel \\"Number of runs\\"" << "\\n"; gnuplot_file << "\\n"; gnuplot_file << "set style line 1 \\\\" << "\\n"; gnuplot_file << " linecolor rgb '#db284c' \\\\" << "\\n"; gnuplot_file << " linetype 1 linewidth 2" << "\\n"; gnuplot_file << "\\n"; gnuplot_file << "plot '" << dat_filename << "' with linespoints linestyle 1" << "\\n"; std::system(("cd gnuplot && gnuplot \\"" + gnuplot_filename + "\\"").c_str()); }} '''.format(bench_name=bench_name, typ=typ, bench_args_decl=bench_args_decl, bench_args_call=bench_args_call, ) return code def gen_bench_unrolls_with_timestamp(f, simd, typ, category): code = '' for unroll in [2, 3, 4]: code += gen_bench_with_timestamp(f, simd, typ, category=category, unroll=unroll) return code def gen_bench_against_with_timestamp(f, simd, typ, against): code = '' # "against" dict looks like: { simd: { type: { name: sig } } } for s in [simd, '*']: if not s in against: continue for t in [typ, '*']: if not t in against[s]: continue for category, f in against[s][t].items(): # Allow function to be simple str (you use this most of 
the # time) if isinstance(f, str): f = fun_from_sig(f) # Now that we have a `Fun` type, we can generate code code += gen_bench_with_timestamp(f, simd, typ, category) return code def gen_bench(f, simd, typ): ## TODO path = gen_filename(f, simd, typ) ## Check if we need to create the file if not common.can_create_filename(_opts, path): return ## Generate specific code for the bench category = common.nsimd_category(simd) code = gen_code(f, simd, typ, category=category) if code is None: return ## Now aggregate every parts bench = '' #bench += gen_bench_asm_function(f, typ, category) bench += gen_bench_against(f, 'cpu', typ, f.bench_against_cpu()) bench += code bench += gen_bench_unrolls(f, simd, typ, category) bench += gen_bench_against(f, simd, typ, f.bench_against_libs()) ## bench_with_timestamp bench_with_timestamp = '' bench_with_timestamp += 'std::map> sums;' + '\n' bench_with_timestamp += 'size_t const nb_runs = 10 * 1000;' + '\n' bench_with_timestamp += gen_bench_against_with_timestamp(f, 'cpu', typ, f.bench_against_cpu()) bench_with_timestamp += gen_bench_with_timestamp(f, simd, typ, category) bench_with_timestamp += gen_bench_unrolls_with_timestamp(f, simd, typ, category) bench_with_timestamp += gen_bench_against_with_timestamp(f, simd, typ, f.bench_against_libs()) bench_with_timestamp += ''' std::string json = ""; json += "{{\\n"; json += " \\"benchmarks\\": [\\n"; for (auto const & bench_name_sum_time : sums) {{ std::string const & bench_name = bench_name_sum_time.first; {typ} const & sum = bench_name_sum_time.second.first; double const & elapsed_time_ns = bench_name_sum_time.second.second; json += " {{" "\\n"; json += " \\"name\\": \\"" + bench_name + "/{typ}\\"," + "\\n"; json += " \\"real_time\\": " + std::to_string(elapsed_time_ns) + "," + "\\n"; json += " \\"sum\\": " + std::string(std::isfinite(sum) ? "" : "\\"") + std::to_string(sum) + std::string(std::isfinite(sum) ? 
"" : "\\"") + "," + "\\n"; json += " \\"time_unit\\": \\"ns\\"\\n"; json += " }}"; if (&bench_name_sum_time != &*sums.rbegin()) {{ json += ","; }} json += "\\n"; }} json += " ]\\n"; json += "}}\\n"; std::cout << json << std::flush; '''.format(typ=typ) ## Finalize code code = gen_bench_from_code(f, typ, bench, '') # bench_with_timestamp ## Write file with common.open_utf8(_opts, path) as f: f.write(code) ## Clang-format it! common.clang_format(_opts, path) # ----------------------------------------------------------------------------- # Entry point def doit(opts): global _opts _opts = opts common.myprint(opts, 'Generating benches') for f in functions.values(): if not f.do_bench: if opts.verbose: common.myprint(opts, 'Skipping bench: {}'.format(f.name)) continue # WE MUST GENERATE CODE FOR EACH SIMD EXTENSION AS OTHER LIBRARY # USUALLY DO NOT PROPOSE A GENERIC INTERFACE for simd in _opts.simd: ## FIXME if simd in ['neon128', 'cpu']: continue for typ in f.types: ## FIXME if typ == 'f16': continue ## Skip non-matching benches if opts.match and not opts.match.match(f.name): continue ## FIXME if f.name in ['gamma', 'lgamma', 'ziplo', 'ziphi', 'unziphi', 'unziplo']: continue gen_bench(f, simd, typ) ================================================ FILE: egg/gen_doc.py ================================================ # Use utf-8 encoding # -*- coding: utf-8 -*- # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. 
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import os import platform import io import sys import subprocess import common import collections import operators import re import string categories = operators.categories operators = operators.operators # ----------------------------------------------------------------------------- # Get output of command def get_command_output(args): p = subprocess.Popen(args, stdout=subprocess.PIPE) lines = p.communicate()[0].split('\n')[0:-1] return '\n'.join([' {}'.format(l) for l in lines]) # ----------------------------------------------------------------------------- def gen_overview(opts): filename = common.get_markdown_file(opts, 'overview') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as fout: fout.write('''# Overview ## NSIMD scalar types Their names follow the following pattern: `Sxx` where - `S` is `i` for signed integers, `u` for unsigned integer or `f` for floatting point number. - `xx` is the number of bits taken to represent the number. Full list of scalar types: ''') for t in common.types: fout.write('- `{}`\n'.format(t)) fout.write(''' ## NSIMD generic SIMD vector types In NSIMD, we call a platform an architecture e.g. Intel, ARM, POWERPC. We call SIMD extension a set of low-level functions and types provided by hardware vendors to access SIMD units. Examples include SSE2, SSE42, AVX, ... When compiling the generic SIMD vector types represents a SIMD register of the target. 
Examples are a `__m128` for Intel SSE, `__m512` for Intel AVX-512 or `svfloat32_t` for Arm SVE. Their names follow the following pattern: - C base API: `vSCALAR` where `SCALAR` is a one of scalar type listed above. - C advanced API: `nsimd_pack_SCALAR` where `SCALAR` is a one of scalar type listed above. - C++ advanced API: `nsimd::pack` where `SCALAR` is a one of scalar type listed above. Full list of SIMD vector types: | Base type | C base API | C advanced API | C++ advanced API | |-----------|------------|----------------|------------------| ''') fout.write('\n'.join([ '| `{typ}` | `v{typ}` | `nsimd_pack_{typ}` | `nsimd::pack<{typ}>` |'. \ format(typ=typ) for typ in common.types])) fout.write(''' ## C/C++ base APIs These come automatically when you include `nsimd/nsimd.h`. You do *not* need to include a header file for having a function. Here is a list of supported platforms and their corresponding SIMD extensions. ''') platforms = common.get_platforms(opts) for p in platforms: fout.write('- Platform `{}`\n'.format(p)) for s in platforms[p].get_simd_exts(): fout.write(' - `{}`\n'.format(s)) fout.write(''' Each simd extension has its own set of SIMD types and functions. Types follow the pattern: `nsimd_SIMDEXT_vSCALAR` where - `SIMDEXT` is the SIMD extensions. - `SCALAR` is one of scalar types listed above. There are also logical types associated to each SIMD vector type. These types are used, for example, to represent the result of a comparison of SIMD vectors. They are usually bit masks. Their name follow the pattern: `nsimd_SIMDEXT_vlSCALAR` where - `SIMDEXT` is the SIMD extensions. - `SCALAR` is one of scalar types listed above. Note 1: Platform `cpu` is a 128 bits SIMD emulation fallback when no SIMD extension has been specified or is supported on a given compilation target. Note 2: as all SIMD extensions of all platforms are different there is no need to put the name of the platform in each identifier. 
Function names follow the pattern: `nsimd_SIMDEXT_FUNCNAME_SCALAR` where - `SIMDEXT` is the SIMD extensions. - `FUNCNAME` is the name of a function e.g. `add` or `sub`. - `SCALAR` is one of scalar types listed above. ### Generic identifier In the base C API, genericity is achieved using macros. - `vec(SCALAR)` is a type to represent a SIMD vector containing SCALAR elements. SCALAR must be one of scalar types listed above. - `vecl(SCALAR)` is a type to represent a SIMD vector of logicals for SCALAR elements. SCALAR must be one of scalar types listed above. - `vec_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector containing SCALAR elements for the simd extension SIMDEXT. SCALAR must be one of scalar types listed above and SIMDEXT must be a valid SIMD extension. - `vecl_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector of logicals for SCALAR elements for the simd extension SIMDEXT. SCALAR must be one of scalar types listed above and SIMDEXT must be a valid SIMD extension. - `vFUNCNAME` takes as input the above types to access the operator FUNCNAME e.g. `vadd`, `vsub`. In C++98 and C++03, type traits are available. - `nsimd::simd_traits::vector` is the SIMD vector type for platform SIMDEXT containing SCALAR elements. SIMDEXT is one of SIMD extension listed above, SCALAR is one of scalar type listed above. - `nsimd::simd_traits::vectorl` is the SIMD vector of logicals type for platform SIMDEXT containing SCALAR elements. SIMDEXT is one of SIMD extensions listed above, SCALAR is one of scalar type listed above. In C++11 and beyond, type traits are still available but typedefs are also provided. - `nsimd::vector` is a typedef to `nsimd::simd_traits::vector`. - `nsimd::vectorl` is a typedef to `nsimd::simd_traits::vectorl`. The C++20 API does not bring different types for SIMD registers nor other way to access the other SIMD types. It only brings concepts instead of usual `typename`s. For more informations cf. . 
Note that all macro and functions available in plain C are still available in C++. ### List of operators provided by the base APIs In the documentation we use interchangeably the terms "function" and "operator". For each operator FUNCNAME a C function (also available in C++) named `nsimd_SIMDEXT_FUNCNAME_SCALAR` is available for each SCALAR type unless specified otherwise. For each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME` is available and takes as its last argument a SCALAR type. For each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME_a` is available and takes as its two last argument a SCALAR type and a SIMDEXT. For each FUNCNAME, a C++ function in namespace `nsimd` named `FUNCNAME` is available. It takes as its last argument the SCALAR type and can optionnally take the SIMDEXT as its last last argument. For example, for the addition of two SIMD vectors `a` and `b` here are the possibilities: ```c++ c = nsimd_add_avx_f32(a, b); // use AVX c = nsimd::add(a, b, f32()); // use detected SIMDEXT c = nsimd::add(a, b, f32(), avx()); // force AVX even if detected SIMDEXT is not AVX c = vadd(a, b, f32); // use detected SIMDEXT c = vadd_e(a, b, f32, avx); // force AVX even if detected SIMDEXT is not AVX ``` Here is a list of available FUNCNAME. ''') for op_name, operator in operators.items(): return_typ = common.get_one_type_generic(operator.params[0], 'SCALAR') func = operator.name args = ', '.join([common.get_one_type_generic(p, 'SCALAR') + \ ' a' + str(count) for count, p in \ enumerate(operator.params[1:])]) fout.write('- `{} {}({});` \n'.format(return_typ, func, args)) if len(operator.types) < len(common.types): typs = ', '.join(['{}'.format(t) for t in operator.types]) fout.write(' Only available for {}\n'.format(typs)) fout.write(''' ## C advanced API (only available in C11) The C advanced API takes advantage of the C11 `_Generic` keyword to provide function overloading. 
Unlike the base API described above there is no need to pass as arguments the base type of the SIMD extension. The informations are contained in the types provided by this API. - `nsimd_pack_SCALAR_SIMDEXT` represents a SIMD vectors containing SCALAR elements of SIMD extension SIMDEXT. - `nsimd::packl_SCALAR_SIMDEXT` represents a SIMD vectors of logicals for SCALAR elements of SIMD extension SIMDEXT. There are versions of the above type without SIMDEXT for which the targeted SIMD extension is automatically chosen. - `nsimd_pack_SCALAR` represents a SIMD vectors containing SCALAR elements. - `nsimd::packl_SCALAR` represents a SIMD vectors of logicals for SCALAR elements. Generic types are also available: - `nsimd_pack(SCALAR)` is a type to represent a SIMD vector containing SCALAR elements. SCALAR must be one of scalar types listed above. - `nsimd_packl(SCALAR)` is a type to represent a SIMD vector of logicals for SCALAR elements. SCALAR must be one of scalar types listed above. - `nsimd_pack_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector containing SCALAR elements for the simd extension SIMDEXT. SCALAR must be one of scalar types listed above and SIMDEXT must be a valid SIMD extension. - `nsimd_packl_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector of logicals for SCALAR elements for the simd extension SIMDEXT. SCALAR must be one of scalar types listed above and SIMDEXT must be a valid SIMD extension. Finally, operators are follow the naming: `nsimd_FUNCNAME` e.g. `nsimd_add`, `nsimd_sub`. ## C++ advanced API The C++ advanced API is called advanced not because it requires C++11 or above but because it makes use of the particular implementation of ARM SVE by ARM in their compiler. We do not know if GCC (and possibly MSVC in the distant future) will use the same approach. Anyway the current implementation allows us to put SVE SIMD vectors inside some kind of structs that behave like standard structs. 
If you want to be sure to write portable code do *not* use this API. Two new types are available. - `nsimd::pack` represents `N` SIMD vectors containing SCALAR elements of SIMD extension SIMDEXT. You can specify only the first template argument. The second defaults to 1 while the third defaults to the detected SIMDEXT. - `nsimd::packl` represents `N` SIMD vectors of logical type containing SCALAR elements of SIMD extension SIMDEXT. You can specify only the first template argument. The second defaults to 1 while the third defaults to the detected SIMDEXT. Use N > 1 when declaring packs to have an unroll of N. This is particularily useful on ARM. Functions that takes packs do not take any other argument unless specified otherwise e.g. the load family of funtions. It is impossible to determine the kind of pack (unroll and SIMDEXT) from the type of a pointer. Therefore in this case, the last argument must be a pack and this same type will then return. Also some functions are available as C++ operators. They follow the naming: `nsimd::FUNCNAME`. 
''') # ----------------------------------------------------------------------------- def gen_doc(opts): common.myprint(opts, 'Generating doc for each function') # Build tree for api.md api = dict() for _, operator in operators.items(): for c in operator.categories: if c not in api: api[c] = [operator] else: api[c].append(operator) # api.md # filename = os.path.join(opts.script_dir, '..','doc', 'markdown', 'api.md') filename = common.get_markdown_file(opts, 'api') if common.can_create_filename(opts, filename): with common.open_utf8(opts, filename) as fout: fout.write('# General API\n\n') fout.write('- [Memory function](memory.md)\n') fout.write('- [Float16 related functions](fp16.md)\n') fout.write('- [Defines provided by NSIMD](defines.md)\n') fout.write('- [NSIMD pack and related functions](pack.md)\n\n') fout.write('- [NSIMD C++20 concepts](concepts.md)\n\n') fout.write('# SIMD operators\n') for c, ops in api.items(): if len(ops) == 0: continue fout.write('\n## {}\n\n'.format(c.title)) for op in ops: Full_name = op.full_name[0].upper() + op.full_name[1:] fout.write('- [{} ({})](api_{}.md)\n'.format( Full_name, op.name, common.to_filename(op.name))) # helper to get list of function signatures def to_string(var): sigs = [var] if type(var) == str or not hasattr(var, '__iter__') \ else list(var) for i in range(0, len(sigs)): sigs[i] = re.sub('[ \n\t\r]+', ' ', sigs[i]) return '\n'.join(sigs) # Operators (one file per operator) # dirname = os.path.join(opts.script_dir, '..','doc', 'markdown') dirname = common.get_markdown_dir(opts) common.mkdir_p(dirname) for op_name, operator in operators.items(): # Skip non-matching doc if opts.match and not opts.match.match(op_name): continue # filename = os.path.join(dirname, 'api_{}.md'.format(common.to_filename( # operator.name))) filename = common.get_markdown_api_file(opts, operator.name) if not common.can_create_filename(opts, filename): continue Full_name = operator.full_name[0].upper() + operator.full_name[1:] with 
common.open_utf8(opts, filename) as fout: fout.write('# {}\n\n'.format(Full_name)) fout.write('## Description\n\n') fout.write(operator.desc) fout.write('\n\n## C base API (generic)\n\n') fout.write('```c\n') fout.write(to_string(operator.get_generic_signature('c_base'))) fout.write('\n```\n\n') fout.write('\n\n## C advanced API (generic, requires C11)\n\n') fout.write('```c\n') fout.write(to_string(operator.get_generic_signature('c_adv'))) fout.write('\n```\n\n') fout.write('## C++ base API (generic)\n\n') fout.write('```c++\n') fout.write(to_string(operator.get_generic_signature('cxx_base'))) fout.write('\n```\n\n') fout.write('## C++ advanced API\n\n') fout.write('```c++\n') fout.write(to_string(operator.get_generic_signature('cxx_adv'). \ values())) fout.write('\n```\n\n') fout.write('## C base API (architecture specifics)') for simd_ext in opts.simd: fout.write('\n\n### {}\n\n'.format(simd_ext.upper())) fout.write('```c\n') for typ in operator.types: fout.write(operator.get_signature(typ, 'c_base', simd_ext)) fout.write(';\n') fout.write('```') fout.write('\n\n## C++ base API (architecture specifics)') for simd_ext in opts.simd: fout.write('\n\n### {}\n\n'.format(simd_ext.upper())) fout.write('```c\n') for typ in operator.types: fout.write(operator.get_signature(typ, 'cxx_base', simd_ext)) fout.write(';\n') fout.write('```') # ----------------------------------------------------------------------------- def gen_modules_md(opts): common.myprint(opts, 'Generating modules.md') mods = common.get_modules(opts) ndms = [] for mod in mods: name = eval('mods[mod].{}.hatch.name()'.format(mod)) desc = eval('mods[mod].{}.hatch.desc()'.format(mod)) ndms.append([name, desc, mod]) filename = common.get_markdown_file(opts, 'modules') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as fout: fout.write('''# Modules NSIMD comes with several additional modules. 
A module provides a set of functionnalities that are usually not at the same level as SIMD intrinsics and/or that do not provide all C and C++ APIs. These functionnalities are given with the library because they make heavy use of NSIMD core which abstract SIMD intrinsics. Below is the exhaustive list of modules. ''') for ndm in ndms: fout.write('- [{}](module_{}_overview.md) \n'.format(ndm[0], ndm[2])) fout.write('\n'.join([' {}'.format(line.strip()) \ for line in ndm[1].split('\n')])) fout.write('\n\n') # ----------------------------------------------------------------------------- def build_exe_for_doc(opts): if not opts.list_files: doc_dir = os.path.join(opts.script_dir, '..', 'doc') if platform.system() == 'Windows': code = os.system('cd {} && nmake /F Makefile.win'. \ format(os.path.normpath(doc_dir))) else: code = os.system('cd {} && make -f Makefile.nix'. \ format(os.path.normpath(doc_dir))) if code == 0: common.myprint(opts, 'Build successful') else: common.myprint(opts, 'Build failed') # ----------------------------------------------------------------------------- def gen_what_is_wrapped(opts): common.myprint(opts, 'Generating "which intrinsics are wrapped"') build_exe_for_doc(opts) wrapped = 'what_is_wrapped.exe' if platform.system() == 'Windows' \ else 'what_is_wrapped' doc_dir = os.path.join(opts.script_dir, '..', 'doc') full_path_wrapped = os.path.join(doc_dir, wrapped) if not os.path.isfile(full_path_wrapped): common.myprint(opts, '{} not found'.format(wrapped)) return # Content for indexing files created in this function index = '# Intrinsics that are wrapped\n' # Build command line cmd0 = '{} {},{},{},{},{},{}'.format(full_path_wrapped, common.in0, common.in1, common.in2, common.in3, common.in4, common.in5) # For now we only list Intel, Arm and POWERPC intrinsics simd_exts = common.x86_simds + common.arm_simds + common.ppc_simds for p in common.get_platforms(opts): index_simds = '' for simd_ext in opts.platforms_list[p].get_simd_exts(): if simd_ext 
not in simd_exts: continue md = os.path.join(common.get_markdown_dir(opts), 'wrapped_intrinsics_for_{}.md'.format(simd_ext)) index_simds += '- [{}](wrapped_intrinsics_for_{}.md)\n'. \ format(simd_ext.upper(), simd_ext) ops = [[], [], [], []] for op_name, operator in operators.items(): if operator.src: continue c_src = os.path.join(opts.include_dir, p, simd_ext, '{}.h'.format(op_name)) ops[operator.output_to].append('{} "{}"'. \ format(op_name, c_src)) if not common.can_create_filename(opts, md): continue with common.open_utf8(opts, md) as fout: fout.write('# Intrinsics wrapped for {}\n\n'. \ format(simd_ext.upper())) fout.write('Notations are as follows:\n' '- `T` for trick usually using other intrinsics\n' '- `E` for scalar emulation\n' '- `NOOP` for no operation\n' '- `NA` means the operator does not exist for ' 'the given type\n' '- `intrinsic` for the actual wrapped intrinsic\n' '\n') cmd = '{} {} same {} >> "{}"'.format(cmd0, simd_ext, ' '.join(ops[common.OUTPUT_TO_SAME_TYPE]), md) if os.system(cmd) != 0: common.myprint(opts, 'Unable to generate markdown for ' '"same"') continue cmd = '{} {} same_size {} >> "{}"'.format(cmd0, simd_ext, ' '.join(ops[common.OUTPUT_TO_SAME_SIZE_TYPES]), md) if os.system(cmd) != 0: common.myprint(opts, 'Unable to generate markdown for ' '"same_size"') continue cmd = '{} {} bigger_size {} >> "{}"'.format(cmd0, simd_ext, ' '.join(ops[common.OUTPUT_TO_UP_TYPES]), md) if os.system(cmd) != 0: common.myprint(opts, 'Unable to generate markdown for ' '"bigger_size"') continue cmd = '{} {} lesser_size {} >> "{}"'.format(cmd0, simd_ext, ' '.join(ops[common.OUTPUT_TO_DOWN_TYPES]), md) if os.system(cmd) != 0: common.myprint(opts, 'Unable to generate markdown for ' '"lesser_size"') continue if index_simds != '': index += '\n## Platform {}\n\n'.format(p) index += index_simds md = os.path.join(common.get_markdown_dir(opts), 'wrapped_intrinsics.md') if common.can_create_filename(opts, md): with common.open_utf8(opts, md) as fout: 
fout.write(index) # ----------------------------------------------------------------------------- def get_html_dir(opts): return os.path.join(opts.script_dir, '..', 'doc', 'html') def get_html_api_file(opts, name, module=''): root = get_html_dir(opts) op_name = to_filename(name) if module == '': return os.path.join(root, 'api_{}.html'.format(op_name)) else: return os.path.join(root, 'module_{}_api_{}.html'. \ format(module, op_name)) def get_html_file(opts, name, module=''): root = get_html_dir(opts) op_name = to_filename(name) if module == '': return os.path.join(root, '{}.html'.format(op_name)) else: return os.path.join(root, 'module_{}_{}.html'.format(module, op_name)) doc_header = '''\ {}

NSIMD documentation
{} ''' doc_footer = '''\ ''' def get_html_header(opts, title, filename): # check if filename is part of a module doc for mod in opts.modules_list: if filename.startswith('module_{}_'.format(mod)): links = eval('opts.modules_list[mod].{}.hatch.doc_menu()'. \ format(mod)) name = eval('opts.modules_list[mod].{}.hatch.name()'.format(mod)) html = '
\n' html += '{} module documentation\n'.format(name) if len(links) > 0: html += '
\n' html += \ '
\n' html += ' | '.join(['{}'. \ format(mod, href, label) \ for label, href in links.items()]) html += '\n
\n
\n' return doc_header.format(title, html) return doc_header.format(title, '') def get_html_footer(): return doc_footer # ----------------------------------------------------------------------------- def gen_doc_html(opts, title): if not opts.list_files: build_exe_for_doc(opts) md2html = 'md2html.exe' if platform.system() == 'Windows' \ else 'md2html' doc_dir = os.path.join(opts.script_dir, '..', 'doc') full_path_md2html = os.path.join(doc_dir, md2html) if not os.path.isfile(full_path_md2html): common.myprint(opts, '{} not found'.format(md2html)) return # get all markdown files md_dir = common.get_markdown_dir(opts) html_dir = get_html_dir(opts) if not os.path.isdir(html_dir): mkdir_p(html_dir) doc_files = [] for filename in os.listdir(md_dir): name = os.path.basename(filename) if name.endswith('.md'): doc_files.append(os.path.splitext(name)[0]) if opts.list_files: ## list gen files for filename in doc_files: input_name = os.path.join(md_dir, filename + '.md') output_name = os.path.join(html_dir, filename + '.html') print(output_name) else: ## gen html files footer = get_html_footer() tmp_file = os.path.join(doc_dir, 'tmp.html') for filename in doc_files: header = get_html_header(opts, title, filename) input_name = os.path.join(md_dir, filename + '.md') output_name = os.path.join(html_dir, filename + '.html') os.system('{} "{}" "{}"'.format(full_path_md2html, input_name, tmp_file)) with common.open_utf8(opts, output_name) as fout: fout.write(header) with io.open(tmp_file, mode='r', encoding='utf-8') as fin: fout.write(fin.read()) fout.write(footer) def gen_html(opts): common.myprint(opts, 'Generating HTML documentation') gen_doc_html(opts, 'NSIMD documentation') # ----------------------------------------------------------------------------- def copy_github_file_to_doc(opts, github_filename, doc_filename): common.myprint(opts, 'Copying {} ---> {}'. 
\ format(github_filename, doc_filename)) if not common.can_create_filename(opts, doc_filename): return with io.open(github_filename, mode='r', encoding='utf-8') as fin: file_content = fin.read() # we replace all links to doc/... by nsimd/... file_content = file_content.replace('doc/markdown/', 'nsimd/') file_content = file_content.replace('doc/', 'nsimd/') # we do not use common.open_utf8 as the copyright is already in content with io.open(doc_filename, mode='w', encoding='utf-8') as fout: fout.write(file_content) # ----------------------------------------------------------------------------- def doit(opts): gen_overview(opts) gen_doc(opts) gen_modules_md(opts) gen_what_is_wrapped(opts) root_dir = os.path.join(opts.script_dir, '..') copy_github_file_to_doc(opts, os.path.join(root_dir, 'README.md'), common.get_markdown_file(opts, 'index')) copy_github_file_to_doc(opts, os.path.join(root_dir, 'CONTRIBUTING.md'), common.get_markdown_file(opts, 'contribute')) gen_html(opts) # This must be last ================================================ FILE: egg/gen_friendly_but_not_optimized.py ================================================ # Copyright (c) 2019 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import common import operators import os from datetime import date import sys # ----------------------------------------------------------------------------- # Generate advanced C++ API def get_impl(operator): if operator.params == ['v', 'v', 'v'] or \ operator.params == ['l', 'v', 'v']: return \ '''template pack{l} operator{cxx_op}(pack const &v, S s) {{ return {op_name}(v, pack(T(s))); }} template pack{l} operator{cxx_op}(S s, pack const &v) {{ return {op_name}(pack(T(s)), v); }}'''.format(l='l' if operator.params[0] == 'l' else '', cxx_op=operator.cxx_operator, op_name=operator.name) if operator.params == ['l', 'l', 'l']: return \ '''template packl operator{cxx_op}(packl const &v, S s) {{ return {op_name}(v, packl(bool(s))); }} template packl operator{cxx_op}(S s, packl const &v) {{ return {op_name}(pack(bool(s)), v); }} template packl operator{cxx_op}(packl const &v, packl const &w) {{ return {op_name}(v, reinterpretl >(w)); }}'''.format(cxx_op=operator.cxx_operator, op_name=operator.name) # ----------------------------------------------------------------------------- # Generate advanced C++ API def doit(opts): common.myprint(opts, 'Generating friendly but not optimized advanced ' 'C++ API') filename = os.path.join(opts.include_dir, 'friendly_but_not_optimized.hpp') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as out: out.write('''#ifndef NSIMD_FRIENDLY_BUT_NOT_OPTIMIZED_HPP #define NSIMD_FRIENDLY_BUT_NOT_OPTIMIZED_HPP #include #include namespace nsimd {{ '''.format(year=date.today().year)) for op_name, operator in operators.operators.items(): if operator.cxx_operator == None or len(operator.params) != 3 or \ operator.name in ['shl', 'shr']: 
continue out.write('''{hbar} {code} '''.format(hbar=common.hbar, code=get_impl(operator))) out.write('''{hbar} }} // namespace nsimd #endif'''.format(hbar=common.hbar)) common.clang_format(opts, filename) ================================================ FILE: egg/gen_modules.py ================================================ # Copyright (c) 2019 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import os import common def doit(opts): mods = common.get_modules(opts) for mod in mods: exec('mods[mod].{}.hatch.doit(opts)'.format(mod)) ================================================ FILE: egg/gen_scalar_utilities.py ================================================ # Copyright (c) 2019 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import os import common import operators import scalar import cuda import rocm import oneapi # ----------------------------------------------------------------------------- def get_gpu_impl(gpu_sig, cuda_impl, rocm_impl, oneapi_sig, oneapi_impl): if cuda_impl == rocm_impl: return '''#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) inline {gpu_sig} {{ {cuda_impl} }} #elif defined(NSIMD_ONEAPI) inline {oneapi_sig} {{ {oneapi_impl} }} #endif'''.format(gpu_sig=gpu_sig, cuda_impl=cuda_impl, oneapi_sig=oneapi_sig, oneapi_impl=oneapi_impl) else: return '''#if defined(NSIMD_CUDA) inline {gpu_sig} {{ {cuda_impl} }} #elif defined(NSIMD_ROCM) inline {gpu_sig} {{ {rocm_impl} }} #elif defined(NSIMD_ONEAPI) inline {oneapi_sig} {{ {oneapi_impl} }} #endif'''.format(gpu_sig=gpu_sig, cuda_impl=cuda_impl, rocm_impl=rocm_impl, oneapi_sig=oneapi_sig, oneapi_impl=oneapi_impl) # ----------------------------------------------------------------------------- def doit(opts): common.myprint(opts, 'Generating scalar implementation for CPU and GPU') filename = os.path.join(opts.include_dir, 'scalar_utilities.h') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as out: # we declare reinterprets now as we need them scalar_tmp = [] gpu_tmp = [] oneapi_tmp = [] for t in operators.Reinterpret.types: for tt in common.get_output_types( t, operators.Reinterpret.output_to): scalar_tmp += [operators.Reinterpret(). \ get_scalar_signature('cpu', t, tt, 'c')] gpu_tmp += [operators.Reinterpret(). \ get_scalar_signature('gpu', t, tt, 'cxx')] oneapi_tmp += [operators.Reinterpret(). 
\ get_scalar_signature('oneapi', t, tt, 'cxx')] scalar_reinterpret_decls = '\n'.join(['NSIMD_INLINE ' + sig + ';' \ for sig in scalar_tmp]) gpu_reinterpret_decls = '\n'.join(['inline ' + sig + ';' \ for sig in gpu_tmp]) oneapi_reinterpret_decls = '\n'.join(['inline ' + sig + ';' \ for sig in oneapi_tmp]) sleef_decls = '' for op in operators.operators.values(): if 'sleef_symbol_prefix' in op.__class__.__dict__: sleef_decls += 'f32 {}_scalar_f32({});\n'. \ format(op.sleef_symbol_prefix, ', '.join(['f32'] * len(op.params[1:]))) sleef_decls += 'f64 {}_scalar_f64({});\n'. \ format(op.sleef_symbol_prefix, ', '.join(['f64'] * len(op.params[1:]))) out.write( '''#ifndef NSIMD_SCALAR_UTILITIES_H #define NSIMD_SCALAR_UTILITIES_H #if NSIMD_CXX > 0 #include #include #else #include #include #endif #ifdef NSIMD_NATIVE_FP16 #if defined(NSIMD_IS_GCC) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdouble-promotion" #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdouble-promotion" #endif #endif {hbar} #if NSIMD_CXX > 0 extern "C" {{ #endif {sleef_decls} #if NSIMD_CXX > 0 }} // extern "C" #endif {hbar} {scalar_reinterpret_decls} #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || \ defined(NSIMD_ONEAPI) namespace nsimd {{ #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) {gpu_reinterpret_decls} #elif defined(NSIMD_ONEAPI) {oneapi_reinterpret_decls} #endif }} // namespace nsimd #endif '''. 
\ format(hbar=common.hbar, sleef_decls=sleef_decls, scalar_reinterpret_decls=scalar_reinterpret_decls, gpu_reinterpret_decls=gpu_reinterpret_decls, oneapi_reinterpret_decls=oneapi_reinterpret_decls)) for op_name, operator in operators.operators.items(): if not operator.has_scalar_impl: continue if operator.params == ['l'] * len(operator.params): out.write('\n\n' + common.hbar + '\n\n') out.write( '''NSIMD_INLINE {c_sig} {{ {scalar_impl} }} #if NSIMD_CXX > 0 namespace nsimd {{ NSIMD_INLINE {cxx_sig} {{ return nsimd_scalar_{op_name}({c_args}); }} {gpu_impl} }} // namespace nsimd #endif'''.format( c_sig=operator.get_scalar_signature('cpu', '', '', 'c'), cxx_sig=operator.get_scalar_signature('cpu', '', '', 'cxx'), op_name=op_name, c_args=', '.join(['a{}'.format(i - 1) \ for i in range(1, len(operator.params))]), scalar_impl=scalar.get_impl(operator, tt, t), gpu_impl=get_gpu_impl( operator.get_scalar_signature('gpu', t, tt, 'cxx'), cuda.get_impl(operator, tt, t), rocm.get_impl(operator, tt, t), operator.get_scalar_signature('oneapi', t, tt, 'cxx'), oneapi.get_impl(operator, tt, t)))) continue for t in operator.types: tts = common.get_output_types(t, operator.output_to) for tt in tts: out.write('\n\n' + common.hbar + '\n\n') out.write( '''NSIMD_INLINE {c_sig} {{ {scalar_impl} }} #if NSIMD_CXX > 0 namespace nsimd {{ NSIMD_INLINE {cxx_sig} {{ return nsimd_scalar_{op_name}_{suffix}({c_args}); }} {gpu_impl} }} // namespace nsimd #endif'''.format( c_sig=operator.get_scalar_signature('cpu', t, tt, 'c'), cxx_sig=operator.get_scalar_signature('cpu', t, tt, 'cxx'), op_name=op_name, suffix=t if operator.closed else '{}_{}'.format(tt, t), c_args=', '.join(['a{}'.format(i - 1) \ for i in range(1, len(operator.params))]), scalar_impl=scalar.get_impl(operator, tt, t), gpu_impl=get_gpu_impl( operator.get_scalar_signature('gpu', t, tt, 'cxx'), cuda.get_impl(operator, tt, t), rocm.get_impl(operator, tt, t), operator.get_scalar_signature('oneapi', t, tt, 'cxx'), oneapi.get_impl(operator, 
tt, t)))) out.write(''' {hbar} #ifdef NSIMD_NATIVE_FP16 #if defined(NSIMD_IS_GCC) #pragma GCC diagnostic pop #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic pop #endif #endif #endif'''.format(hbar=common.hbar)) common.clang_format(opts, filename) ================================================ FILE: egg/gen_src.py ================================================ # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import common import operators import os from datetime import date import sys # ----------------------------------------------------------------------------- # Implementations for output def get_put_impl(simd_ext): args = { 'i8' : ['"%d"', '(int)buf[i]'], 'u8' : ['"%d"', '(int)buf[i]'], 'i16': ['"%d"', '(int)buf[i]'], 'u16': ['"%d"', '(int)buf[i]'], 'i32': ['"%d"', 'buf[i]'], 'u32': ['"%u"', 'buf[i]'], 'i64': ['"%lld"', '(nsimd_longlong)buf[i]'], 'u64': ['"%llu"', '(nsimd_ulonglong)buf[i]'], 'f16': ['"%e"', '(double)nsimd_f16_to_f32(buf[i])'], 'f32': ['"%e"', '(double)buf[i]'], 'f64': ['"%e"', 'buf[i]'], } ret = '''#ifdef NSIMD_LONGLONG_IS_EXTENSION #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG) #pragma GCC diagnostic ignored "-Wformat" #endif #endif #include extern "C" { ''' for typ in common.types: fmt = \ '''NSIMD_DLLEXPORT int NSIMD_VECTORCALL nsimd_put_{simd_ext}_{l}{typ}(FILE *out, const char *fmt, nsimd_{simd_ext}_v{l}{typ} v) {{ using namespace nsimd; {typ} buf[NSIMD_MAX_LEN({typ})]; int n = len({typ}(), {simd_ext}()); store{l}u(buf, v, {typ}(), {simd_ext}()); if (fputs("{{ ", out) == EOF) {{ return -1; }} int ret = 2; for (int i = 0; i < n; i++) {{ int code; if (fmt != NULL) {{ code = fprintf(out, fmt, {val}); }} else {{ code = fprintf(out, {fmt}, {val}); }} if (code < 0) {{ return -1; }} ret += code; if (i < n - 1) {{ if (fputs(", ", out) == EOF) {{ return -1; }} ret += 2; }} }} if (fputs(" }}", out) == EOF) {{ return -1; }} return ret + 2; }} {hbar} ''' ret += fmt.format(typ=typ, l='', simd_ext=simd_ext, hbar=common.hbar, fmt=args[typ][0], val=args[typ][1]) ret += fmt.format(typ=typ, l='l', simd_ext=simd_ext, hbar=common.hbar, fmt=args[typ][0], val=args[typ][1]) ret += '} // extern "C"\n' return ret # ----------------------------------------------------------------------------- # Generate base APIs def write_cpp(opts, simd_ext, emulate_fp16): filename = os.path.join(opts.src_dir, 'api_{}.cpp'.format(simd_ext)) if not 
common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as out: out.write('''#define NSIMD_INSIDE #include #include '''.format(year=date.today().year)) out.write(get_put_impl(simd_ext)) common.clang_format(opts, filename) def doit(opts): common.mkdir_p(opts.src_dir) common.myprint(opts, 'Generating source for binary') opts.platforms = common.get_platforms(opts) for platform in opts.platforms: mod = opts.platforms[platform] for simd_ext in mod.get_simd_exts(): write_cpp(opts, simd_ext, mod.emulate_fp16(simd_ext)) ================================================ FILE: egg/gen_tests.py ================================================ # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import os import math import sys import common import operators from datetime import date # ----------------------------------------------------------------------------- # Helper functions def should_i_do_the_test(operator, tt='', t=''): if operator.name == 'cvt' and t in common.ftypes and tt in common.iutypes: # When converting from float to int to float then we may not # get the initial result because of roundings. As tests are usually # done by going back and forth then both directions get tested in the # end return False if operator.name == 'reinterpret' and t in common.iutypes and \ tt in common.ftypes: # When reinterpreting from int to float we may get NaN or infinities # and no ones knows what this will give when going back to ints # especially when float16 are emulated. Again as tests are done by # going back and forth both directions get tested in the end. return False if operator.name in ['notb', 'andb', 'andnotb', 'xorb', 'orb'] and \ t == 'f16': # Bit operations on float16 are hard to check because they are # emulated in most cases. Therefore going back and forth with # reinterprets for doing bitwise operations make the bit in the last # place to wrong. This is normal but makes testing real hard. So for # now we do not test them on float16. return False if operator.name in ['len', 'set1', 'set1l', 'mask_for_loop_tail', 'loadu', 'loada', 'storeu', 'storea', 'loadla', 'loadlu', 'storela', 'storelu', 'if_else1']: # These functions are used in almost every tests so we consider # that they are extensively tested. return False if operator.name in ['store2a', 'store2u', 'store3a', 'store3u', 'store4a', 'store4u', 'scatter', 'scatter_linear', 'downcvt', 'to_logical']: # These functions are tested along with their load counterparts. 
# downcvt is tested along with upcvt and to_logical is tested with # to_mask return False return True # ----------------------------------------------------------------------------- # CBPRNG def cbprng_impl(typ, domain_, for_cpu, only_int = False): code = '((((unsigned int)(1 + i) * 69342380u + 414585u) ' \ '^ ((unsigned int)(1 + j) * 89375027u + 952905u))' \ '% 1000000u)' def c_code(a0_, a1_): if a1_ < a0_: raise ValueError("a0 must be lesser than a1") if typ in common.utypes and a0_ < 0.0 and a1_ < 0.0: raise ValueError("a0 and a1 must be positive") if typ in common.ftypes: a0 = a0_ a1 = a1_ else: a0 = 0 if typ in common.utypes and a0_ < 0 else math.ceil(a0_) a1 = math.floor(a1_) if a1 < a0: raise ValueError("a0 and a1 must be positive after filtering") if typ in common.iutypes: return 'return ({})({} + (f32)((i32){} % {}));'. \ format(typ, a0, code, a1 - a0 + 1) elif typ == 'f16': return \ 'return {}({}(((f32){} + (f32){} * (f32)({}) / 1000000.0f)));'. \ format('(f16)' if not for_cpu else 'nsimd_f32_to_f16', '(f32)(i32)' if only_int else '', a0, a1 - a0, code) elif typ in ['f32', 'f64']: return \ 'return {}(({}){} + ({}){} * ({}){} / ({})1000000);'. 
\ format('({})({})'.format(typ, 'i' + typ[1:]) if only_int else '', typ, a0, typ, a1 - a0, typ, code, typ) if typ not in common.utypes: domain = domain_ domain = [] for i in range(len(domain_) // 2): if domain_[2 * i + 1] > 0: domain.append(domain_[2 * i]) domain.append(domain_[2 * i + 1]) if len(domain) == 0: raise ValueError('domain {} is empty after filtering'.format(domain_)) nb_intervals = len(domain) // 2 if nb_intervals == 1: return ' {}'.format(c_code(domain[0], domain[1])) ret = 'int piece = ((1 + i) * (1 + j)) % {};'.format(nb_intervals) for i in range(nb_intervals - 1): ret += '\nif (piece == {}) {{\n'.format(i) ret += ' {}\n'.format(c_code(domain[2 * i], domain[2 * i + 1])) ret += '} else ' ret += '{\n' ret += ' {}\n'.format(c_code(domain[-2], domain[-1])) ret += '}' return ret def cbprng(typ, operator, target, gpu_params = None): if target not in ['cpu', 'cuda', 'hip', 'oneapi']: raise ValueError('Unsupported target, must be cpu, cuda, hip or ' 'oneapi') arity = len(operator.params[1:]) ret = '{}{} random_impl(int i, int j) {{\n'. \ format('' if target in ['cpu', 'oneapi'] else '__device__ ', typ) for_cpu = (target == 'cpu') if arity == 1: ret += cbprng_impl(typ, operator.domain[0], for_cpu, operator.tests_on_integers_only) else: for i in range(arity - 1): ret += 'if (j == {}) {{\n {}\n}} else '. \ format(i, cbprng_impl(typ, operator.domain[i], for_cpu, operator.tests_on_integers_only)) ret += '{{\n{}\n}} '. 
\ format(cbprng_impl(typ, operator.domain[-1], for_cpu, operator.tests_on_integers_only)) ret += '\n}\n\n' if target == 'cpu': ret += '''void random({} *dst, unsigned int n, int j) {{ unsigned int i; for (i = 0; i < n; i++) {{ dst[i] = random_impl((int)i, j); }} }}'''.format(typ) elif target == 'cuda': ret += '''__global__ void random_kernel({typ} *dst, int n, int j) {{ int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < n) {{ dst[i] = random_impl((int)i, j); }} }} void random({typ} *dst, unsigned int n, int j) {{ random_kernel<<<{gpu_params}>>>(dst, (int)n, j); }}'''.format(typ=typ, gpu_params=gpu_params) elif target == 'hip': ret += '''__global__ void random_kernel({typ} *dst, size_t n, int j) {{ size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if (i < n) {{ dst[i] = random_impl((int)i, j); }} }} void random({typ} *dst, unsigned int n, int j) {{ hipLaunchKernelGGL(random_kernel, {gpu_params}, 0, 0, dst, n, j); }}'''.format(typ=typ, gpu_params=gpu_params) elif target == 'oneapi': ret += '''inline void random_kernel({typ} *dst, unsigned int n, int j, sycl::nd_item<1> item) {{ size_t i = item.get_global_id().get(0); if (i < n) {{ dst[i] = random_impl((int)i, j); }} }} void random({typ} *dst, unsigned int n, int j) {{ size_t nt = (size_t)nsimd_kernel_param({n}, {tpb}); sycl::queue q_ = nsimd::oneapi::default_queue(); q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(nt), sycl::range<1>({tpb})), [=](sycl::nd_item<1> item){{ random_kernel(dst, n, j, item); }}).wait_and_throw(); }}'''.format(typ=typ, n=gpu_params[0], tpb=gpu_params[1]) return ret # ----------------------------------------------------------------------------- posix_c_source = \ '''#if !defined(_POSIX_C_SOURCE) #define _POSIX_C_SOURCE 200112L #elif _POSIX_C_SOURCE < 200112L #error "_POSIX_C_SOURCE defined by third-party but must be >= 200112L" #endif''' msvc_c4334_warning = \ '''#ifdef NSIMD_IS_MSVC // MSVC wrongly emits warning C4333 on the following pieces of code: // (i64)(1 << (rand() % 
4)) // (u64)(1 << (rand() % 4)) // so we deactive it for now #pragma warning( disable : 4334 ) #endif''' # ----------------------------------------------------------------------------- # Get filename for test def get_filename(opts, op, typ, lang, custom_name=''): tests_dir = os.path.join(opts.tests_dir, lang) common.mkdir_p(tests_dir) ext = { 'c_base': 'prec11.c', 'c_adv': 'c' } if not custom_name: filename = os.path.join(tests_dir, '{}.{}.{}'.format(op.name, typ, ext[lang] if lang in ['c_base', 'c_adv'] else 'cpp')) else: filename = os.path.join(tests_dir, '{}_{}.{}.{}'.format(op.name, custom_name, typ, ext[lang] if lang in ['c_base', 'c_adv'] else 'cpp')) if common.can_create_filename(opts, filename): return filename else: return None # ----------------------------------------------------------------------------- # Get standard includes def get_includes(lang): ret = '#include \n' if lang == 'cxx_adv': ret += '#include \n' if lang == 'c_adv': ret += '#include \n' if lang in ['c_base', 'c_adv']: ret += '''#include #include #include #include #include #include ''' else: ret += '''#include #include #include #include #include #include ''' return ret # ----------------------------------------------------------------------------- # Function to compute number of common bits between two floatting points # numbers distance_int = ''' int distance({typ} a, {typ} b) {{ {typ} d = (a > b ? a - b : b - a); return (int)((u64)d > (u64)INT_MAX) ? 
(u64)INT_MAX : (u64)d); }} ''' distance_float = ''' int distance({typ} a, {typ} b) {{ if (nsimd_isnan_{typ}(a) && nsimd_isnan_{typ}(b)) {{ return 0; }} if (nsimd_isnan_{typ}(a) || nsimd_isnan_{typ}(b)) {{ return -1; }} if (nsimd_isinf_{typ}(a) && nsimd_isinf_{typ}(b)) {{ return 0; }} if (nsimd_isinf_{typ}(a) || nsimd_isinf_{typ}(b)) {{ return -1; }} return nsimd_ufp_{typ}(a, b); }} /* ------------------------------------------------------------------------- */ ''' distance = { 'i8': distance_int.format(typ='i8'), 'u8': distance_int.format(typ='u8'), 'i16': distance_int.format(typ='i16'), 'u16': distance_int.format(typ='u16'), 'i32': distance_int.format(typ='i32'), 'u32': distance_int.format(typ='u32'), 'i64': distance_int.format(typ='i64'), 'u64': distance_int.format(typ='u64'), 'f16': distance_float.format(typ='f16'), 'f32': distance_float.format(typ='f32'), 'f64': distance_float.format(typ='f64') } # ----------------------------------------------------------------------------- # Template for a lot of tests template = \ '''{includes} #define SIZE (2048 / {sizeof}) #define STATUS "test of {op_name} over {typ}" #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} /* ------------------------------------------------------------------------- */ {extra_code} int comp_function({typ} ref_out, {typ} nsimd_out) {{ {comp}; }} int main(void) {{ int vi, i, step; {typ} *vout_ref, *vout_nsimd; {vin_defi} CHECK(vout_ref = ({typ}*)nsimd_aligned_alloc(SIZE * {sizeof})); CHECK(vout_nsimd = ({typ}*)nsimd_aligned_alloc(SIZE * {sizeof})); step = vlen({typ}); fprintf(stdout, STATUS "...\\n"); fflush(stdout); /* Fill input vector(s) with random values */ {vin_rand} /* We ensure that inputs are normal numbers */ for (i = 0; i < SIZE; i++) {{ {denormalize_inputs} }} /* Fill vout_ref output vector with reference values */ for (i = 0; i < SIZE; i += {cpu_step}) {{ 
/* This is a call directly to the cpu API of nsimd to ensure that we call the scalar version of the function */ {vout_ref_comp} }} /* Fill vout_nsimd output vector with computed values */ for (i = 0; i < SIZE; i += step) {{ {vout_nsimd_comp} }} {dnz_flush_to_zero} /* Compare results */ for (vi = 0; vi < SIZE; vi += step) {{ for (i = vi; i < vi + step; i++) {{ if (comp_function(vout_ref[i], vout_nsimd[i])) {{ fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return -1; }} }} }} fprintf(stdout, STATUS "... OK\\n"); fflush(stdout); return 0; }}''' # ----------------------------------------------------------------------------- # Common to most of the tests def get_content(op, typ, lang): cast = 'f32' if typ in ['f16', 'f32'] else 'f64' zero = 'nsimd_f32_to_f16(0.0f)' if typ == 'f16' else '({})0'.format(typ) # By default we use emulation functions ("cpu" architecture) for testing # in which case increment is given by nsimd_cpu_len() cpu_step = 'nsimd_len_cpu_{}()'.format(typ) nargs = range(1, len(op.params)) if typ in common.ftypes: code = ['''if (!nsimd_isnormal_{typ}(vin{i}[i])) {{ vin{i}[i] = {zero}; }}'''.format(typ=typ, i=i, zero=zero) for i in nargs] denormalize_inputs = '\n'.join(code) else: denormalize_inputs = '' # Depending on function parameters, generate specific input, ... if all(e == 'v' for e in op.params) or all(e == 'l' for e in op.params): logical = 'l' if op.params[0] == 'l' else '' # Make vin_defi code = ['{} *vin{};'.format(typ, i) for i in nargs] code += ['CHECK(vin{} = ({}*)nsimd_aligned_alloc(SIZE * {}));'. format(i, typ, common.sizeof(typ)) for i in nargs] vin_defi = '\n'.join(code) vin_rand = '\n'.join(['random(vin{}, SIZE, {});'.format(i, i - 1) \ for i in nargs]) # Make vout_ref_comp args = ', '.join(['va{}'.format(i) for i in nargs]) code = ['nsimd_cpu_v{}{} {}, vc;'.format(logical, typ, args)] code += ['va{} = nsimd_load{}u_cpu_{}(&vin{}[i]);'. 
format(i, logical, typ, i) for i in nargs] code += ['vc = nsimd_{}_cpu_{}({});'.format(op.name, typ, args)] code += ['nsimd_store{}u_cpu_{}(&vout_ref[i], vc);'. \ format(logical, typ)] vout_ref_comp = '\n'.join(code) # Make vout_nsimd_comp args = ', '.join(['va{}'.format(i) for i in nargs]) if lang == 'c_base': code = ['vec{}({}) {}, vc;'.format(logical, typ, args)] code += ['va{} = vload{}u(&vin{}[i], {});'. format(i, logical, i, typ) for i in nargs] code += ['vc = v{}({}, {});'.format(op.name, args, typ)] code += ['vstore{}u(&vout_nsimd[i], vc, {});'.format(logical, typ)] vout_nsimd_comp = '\n'.join(code) if lang == 'c_adv': code = ['nsimd_pack{}_{} {}, vc;'.format(logical, typ, args)] code += ['va{} = nsimd_load{}u(nsimd_pack{}_{}, &vin{}[i]);'. format(i, logical, logical, typ, i) for i in nargs] code += ['vc = nsimd_{}({});'.format(op.name, args)] code += ['nsimd_store{}u(&vout_nsimd[i], vc);'. \ format(logical, typ)] vout_nsimd_comp = '\n'.join(code) if lang == 'cxx_base': code = ['vec{}({}) {}, vc;'.format(logical, typ, args)] code += ['va{} = nsimd::load{}u(&vin{}[i], {}());'. format(i, logical, i, typ) for i in nargs] code += ['vc = nsimd::{}({}, {}());'.format(op.name, args, typ)] code += ['nsimd::store{}u(&vout_nsimd[i], vc, {}());'. \ format(logical, typ)] vout_nsimd_comp = '\n'.join(code) if lang == 'cxx_adv': code = ['nsimd::pack{}<{}> {}, vc;'.format(logical, typ, args)] code += ['''va{i} = nsimd::load{logical}u< nsimd::pack{logical}<{typ}> >( &vin{i}[i]);'''. format(i=i, logical=logical, typ=typ) for i in nargs] if op.cxx_operator: if len(op.params[1:]) == 1: code += ['vc = {}va1;'. format(op.cxx_operator)] if len(op.params[1:]) == 2: code += ['vc = va1 {} va2;'. format(op.cxx_operator)] else: code += ['vc = nsimd::{}({});'.format(op.name, args)] code += ['nsimd::store{}u(&vout_nsimd[i], vc);'. 
\ format(logical, typ)] vout_nsimd_comp = '\n'.join(code) elif op.params == ['l', 'v', 'v']: vin_defi = \ '''{typ} *vin1, *vin2; CHECK(vin1 = ({typ}*)nsimd_aligned_alloc(SIZE * {sizeof})); CHECK(vin2 = ({typ}*)nsimd_aligned_alloc(SIZE * {sizeof}));'''. \ format(typ=typ, sizeof=common.sizeof(typ)) code = ['random(vin{}, SIZE, {});'.format(i, i - 1) for i in nargs] vin_rand = '\n'.join(code) vout_ref_comp = '''nsimd_cpu_v{typ} va1, va2; nsimd_cpu_vl{typ} vc; va1 = nsimd_loadu_cpu_{typ}(&vin1[i]); va2 = nsimd_loadu_cpu_{typ}(&vin2[i]); vc = nsimd_{op_name}_cpu_{typ}(va1, va2); nsimd_storelu_cpu_{typ}(&vout_ref[i], vc);'''. \ format(typ=typ, op_name=op.name) if lang == 'c_base': vout_nsimd_comp = '''vec({typ}) va1, va2; vecl({typ}) vc; va1 = vloadu(&vin1[i], {typ}); va2 = vloadu(&vin2[i], {typ}); vc = v{op_name}(va1, va2, {typ}); vstorelu(&vout_nsimd[i], vc, {typ});'''. \ format(typ=typ, op_name=op.name) if lang == 'c_adv': vout_nsimd_comp = '''nsimd_pack_{typ} va1, va2; nsimd_packl_{typ} vc; va1 = nsimd_loadu(nsimd_pack_{typ}, &vin1[i]); va2 = nsimd_loadu(nsimd_pack_{typ}, &vin2[i]); vc = nsimd_{op_name}(va1, va2); nsimd_storelu(&vout_nsimd[i], vc);'''. \ format(typ=typ, op_name=op.name) if lang == 'cxx_base': vout_nsimd_comp = \ '''vec({typ}) va1, va2; vecl({typ}) vc; va1 = nsimd::loadu(&vin1[i], {typ}()); va2 = nsimd::loadu(&vin2[i], {typ}()); vc = nsimd::{op_name}(va1, va2, {typ}()); nsimd::storelu(&vout_nsimd[i], vc, {typ}());'''. \ format(typ=typ, op_name=op.name) if lang == 'cxx_adv': if op.cxx_operator: do_computation = 'vc = va1 {} va2;'. \ format(op.cxx_operator) else: do_computation = 'vc = nsimd::{}(va1, va2, {}());'. \ format(op.name, typ) vout_nsimd_comp = \ '''nsimd::pack<{typ}> va1, va2; nsimd::packl<{typ}> vc; va1 = nsimd::loadu >(&vin1[i]); va2 = nsimd::loadu >(&vin2[i]); {do_computation} nsimd::storelu(&vout_nsimd[i], vc);'''. 
\ format(typ=typ, op_name=op.name, do_computation=do_computation) elif op.params == ['v', 'v', 'p']: vin_defi = \ '''{typ} *vin1; CHECK(vin1 = ({typ}*)nsimd_aligned_alloc(SIZE * {sizeof}));'''. \ format(typ=typ, sizeof=common.sizeof(typ)) vin_rand = 'random(vin1, SIZE, 0);' vout_ref_comp = \ '''nsimd_cpu_v{typ} va1, vc; va1 = nsimd_loadu_cpu_{typ}(&vin1[i]); vc = nsimd_{op_name}_cpu_{typ}(va1, (i / step) % {typnbytes}); nsimd_storeu_cpu_{typ}(&vout_ref[i], vc);'''. \ format(typ=typ, op_name=op.name, typnbytes=typ[1:]) if lang == 'c_base': vout_nsimd_comp = \ '''vec({typ}) va1, vc; va1 = vloadu(&vin1[i], {typ}); vc = v{op_name}(va1, (i / step) % {typnbytes}, {typ}); vstoreu(&vout_nsimd[i], vc, {typ});'''. \ format(typ=typ, op_name=op.name, typnbytes=typ[1:]) if lang == 'c_adv': vout_nsimd_comp = \ '''nsimd_pack_{typ} va1, vc; va1 = nsimd_loadu(nsimd_pack_{typ}, &vin1[i]); vc = nsimd_{op_name}(va1, (i / step) % {typnbytes}); nsimd_storeu(&vout_nsimd[i], vc);'''. \ format(typ=typ, op_name=op.name, typnbytes=typ[1:]) if lang == 'cxx_base': vout_nsimd_comp = \ '''vec({typ}) va1, vc; va1 = nsimd::loadu(&vin1[i], {typ}()); vc = nsimd::{op_name}(va1, (i / step) % {typnbytes}, {typ}()); nsimd::storeu(&vout_nsimd[i], vc, {typ}());'''. \ format(typ=typ, op_name=op.name, typnbytes=typ[1:]) if lang == 'cxx_adv': if op.cxx_operator: do_computation = 'vc = va1 {} ((i / step) % {typnbytes});'. \ format(op.cxx_operator, typnbytes=typ[1:]) else: do_computation = \ 'vc = nsimd::{}(va1, (i / step) % {typnbytes});'. \ format(op.name, typnbytes=typ[1:]) vout_nsimd_comp = \ '''nsimd::pack<{typ}> va1, vc; va1 = nsimd::loadu >(&vin1[i]); {do_computation} nsimd::storeu(&vout_nsimd[i], vc);'''. \ format(typ=typ, do_computation=do_computation) else: raise ValueError('No test available for operator "{}" on type "{}"'. 
format(op.name, typ)) return { 'vin_defi': vin_defi, 'vin_rand': vin_rand, 'cpu_step': cpu_step, 'vout_ref_comp': vout_ref_comp, 'vout_nsimd_comp': vout_nsimd_comp, 'denormalize_inputs': denormalize_inputs } # ----------------------------------------------------------------------------- # Generate test in C, C++ (base API) and C++ (advanced API) for almost all # tests def gen_test(opts, op, typ, lang): filename = get_filename(opts, op, typ, lang) if filename == None: return content = get_content(op, typ, lang) extra_code = cbprng(typ, op, 'cpu') if op.name in ['notb', 'andb', 'orb', 'xorb', 'andnotb']: comp = 'return nsimd_scalar_reinterpret_{uT}_{typ}(ref_out) != ' \ 'nsimd_scalar_reinterpret_{uT}_{typ}(nsimd_out)'. \ format(typ=typ, uT=common.bitfield_type[typ]) elif op.name in ['max', 'min'] and typ in common.ftypes: comp = 'return nsimd_scalar_ne_{}(ref_out, nsimd_out);'.format(typ) else: if typ in common.ftypes: comp = 'return distance(ref_out, nsimd_out) < {}'. \ format(op.ufp[typ]) extra_code += distance[typ] else: comp = 'return nsimd_scalar_ne_{}(ref_out, nsimd_out);'. \ format(typ) includes = get_includes(lang) if typ in common.ftypes: dnz_flush_to_zero = \ '''/* We flush subnormal numbers to zero because support for it */ /* can be disabled, some intrinsics do not support them, */ /* execution of 32-bits code on 64-bits system may have different */ /* ways of handling them. 
*/ for (i = 0; i < SIZE; i++) {{ if (!nsimd_isnormal_{typ}(vout_ref[i])) {{ vout_ref[i] = {zero}; }} if (!nsimd_isnormal_{typ}(vout_nsimd[i])) {{ vout_nsimd[i] = {zero}; }} }}'''.format(typ=typ, zero='({})0'.format(typ) if typ != 'f16' \ else 'nsimd_f32_to_f16(0.0f)') else: dnz_flush_to_zero = '' with common.open_utf8(opts, filename) as out: out.write(template.format( includes=includes, sizeof=common.sizeof(typ), typ=typ, op_name=op.name, year=date.today().year, comp=comp, dnz_flush_to_zero=dnz_flush_to_zero, extra_code=extra_code, **content)) common.clang_format(opts, filename) # ----------------------------------------------------------------------------- # Tests for addv def gen_addv(opts, op, typ, lang): filename = get_filename(opts, op, typ, lang) if filename == None: return if typ == 'f16': rand = 'nsimd_f32_to_f16((f32)(rand() % 3) - 1.0f)' zero = 'nsimd_f32_to_f16(0.0f)' comp = 'nsimd_f16_to_f32(vout[i]) != nsimd_f16_to_f32(vref[i])' else: rand = '({})((int)(rand() % 3) - 1)'.format(typ) zero = '({})0'.format(typ) comp = 'vout[i] != vref[i]' if lang == 'c_base': nsimd = 'vaddv(vloada(vin + (i * step), {typ}), {typ})'. \ format(typ=typ) elif lang == 'c_adv': nsimd = 'nsimd_addv(nsimd_loada(nsimd_pack_{}, vin + (i * step)))'. \ format(typ) elif lang == 'cxx_base': nsimd = 'nsimd::addv(nsimd::loada(vin + (i * step), {}()), {}())'. 
\ format(typ, typ) elif lang == 'cxx_adv': nsimd = 'nsimd::addv(nsimd::loada >' \ '(vin + (i * step)))'.format(typ) with common.open_utf8(opts, filename) as out: out.write( '''{posix_c_source} {includes} #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} #define STATUS "test of addv over {typ}" int main() {{ int step = vlen({typ}); int size = 2048; int i; {typ} *vin, *vref, *vout; CHECK(vin = ({typ} *)nsimd_aligned_alloc(size * {sizeof} * step)); CHECK(vref = ({typ} *)nsimd_aligned_alloc(size * {sizeof})); CHECK(vout = ({typ} *)nsimd_aligned_alloc(size * {sizeof})); fprintf(stdout, STATUS "...\\n"); fflush(stdout); for (i = 0; i < step * size; i++) {{ vin[i] = {rand}; }} for (i = 0; i < size; i++) {{ int j; {typ} acc = {zero}; for (j = step * i; j < step * i + step; j++) {{ acc = nsimd_scalar_add_{typ}(acc, vin[j]); }} vref[i] = acc; }} for (i = 0; i < size; i++) {{ vout[i] = {nsimd}; }} for (i = 0; i < size; i++) {{ if ({comp}) {{ fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return -1; }} }} fprintf(stdout, STATUS "... 
OK\\n"); fflush(stdout); return 0; }} '''.format(typ=typ, sizeof=common.sizeof(typ), zero=zero, rand=rand, comp=comp, nsimd=nsimd, posix_c_source=posix_c_source, includes=get_includes(lang))) common.clang_format(opts, filename) # ----------------------------------------------------------------------------- # General tests helpers for adds/subs def aligned_alloc_error(): return ''' #define CHECK(a) \\ {{ \\ errno = 0; \\ if (!(a)) \\ {{ \\ fprintf(stderr, \"ERROR: \" #a \":%d: %s\\n\", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} ''' def equal(typ): return ''' int equal({typ} expected_result, {typ} computed_result) {{ return expected_result == computed_result; }} '''.format(typ=typ) def adds_subs_check_case(): return ''' #define CHECK_CASE(test_output, which_test) \\ {{ \\ if(0 == (test_output)) \\ {{ \\ fprintf(stdout, STATUS \" ... \" which_test \" check FAIL\\n\"); \\ fflush(stdout); \\ return -1; \\ }} \\ }} ''' def random_sign_flip(): return ''' int random_sign_flip(void) {{ return 2 * (rand() % 2) - 1; }} ''' def zero_out_arrays(typ): return ''' void zero_out_arrays({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ int ii = 0; for(ii = 0; ii < SIZE; ++ii) {{ vin1[ii] = ({typ})0; vin2[ii] = ({typ})0; vout_expected[ii] = ({typ})0; vout_computed[ii] = ({typ})0; }} }} '''.format(typ=typ) def compute_op_given_language(typ, op, language): if 'c_base' == language: return \ '''vec({typ}) va1, va2, vc; va1 = vloadu(&vin1[outer], {typ}); va2 = vloadu(&vin2[outer], {typ}); vc = v{op}(va1, va2, {typ}); vstoreu(&vout_computed[outer], vc, {typ});'''. \ format(typ=typ, op=op) elif 'c_adv' == language: return \ '''nsimd_pack_{typ} va1, va2, vc; va1 = nsimd_loadu(nsimd_pack_{typ}, &vin1[outer]); va2 = nsimd_loadu(nsimd_pack_{typ}, &vin2[outer]); vc = nsimd_{op}(va1, va2); nsimd_storeu(&vout_computed[outer], vc);'''. 
\ format(typ=typ, op=op) elif 'cxx_base' == language: return \ '''vec({typ}) va1, va2, vc; va1 = nsimd::loadu(&vin1[outer], {typ}()); va2 = nsimd::loadu(&vin2[outer], {typ}()); vc = nsimd::{op}(va1, va2, {typ}()); nsimd::storeu(&vout_computed[outer], vc, {typ}());'''. \ format(typ=typ, op=op) else: return \ '''nsimd::pack<{typ}> va1, va2, vc; va1 = nsimd::loadu >(&vin1[outer]); va2 = nsimd::loadu >(&vin2[outer]); vc = nsimd::{op}(va1, va2); nsimd::storeu(&vout_computed[outer], vc);'''. \ format(typ=typ, op=op) def compare_expected_vs_computed(typ, op, language): values_computation = compute_op_given_language(typ, op, language) return ''' int compare_expected_vs_computed(const {typ}* vin1, const {typ}* vin2, const {typ}* vout_expected, {typ} vout_computed[]) {{ const int step = vlen({typ}); int outer = 0; int inner = 0; for (outer = 0; outer < SIZE; outer += step) {{ /* Fill vout_computed with computed values */ {values_computation} /* Compare results */ for (inner = outer; inner < outer + step; ++inner) {{ if (! 
equal(vout_expected[inner], vout_computed[inner])) {{ return 0; }} }} }} return 1; }} '''.format(typ=typ, values_computation=values_computation) def test_signed_neither_overflow_nor_underflow(typ, min_, max_, operator, check): return ''' int test_neither_overflow_nor_underflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ int ii = 0; while(ii < SIZE) {{ {typ} a = ({typ})((random_sign_flip() * rand()) % {max_} % {min_}); {typ} b = ({typ})((random_sign_flip() * rand()) % {max_} % {min_}); if({check}(a, b)) {{ vin1[ii] = a; vin2[ii] = b; vout_expected[ii] = ({typ})(a {operator} b); ++ ii; }} }} assert(ii == SIZE); /* Test: if (neither overflow nor underflow) {{ vout_expected[ii] == a {operator} b; }} */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, min_=min_, max_=max_, operator=operator, check=check) def test_signed_all_cases(typ, min_, max_, oper, oper_is_overflow, oper_is_underflow): return ''' int test_all_cases({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ int ii = 0; for(ii = 0; ii < SIZE; ++ii) {{ vin1[ii] = ({typ})((random_sign_flip() * rand()) % {max_} % {min_}); vin2[ii] = ({typ})((random_sign_flip() * rand()) % {max_} % {min_}); if({oper_is_overflow}(vin1[ii], vin2[ii])) {{ vout_expected[ii] = {max_}; }} else if({oper_is_underflow}(vin1[ii], vin2[ii])) {{ vout_expected[ii] = {min_}; }} else {{ vout_expected[ii] = ({typ})(vin1[ii] {oper} vin2[ii]); }} }} /* Test all cases */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} ''' .format(typ=typ, min_=min_, max_=max_, oper=oper, oper_is_overflow=oper_is_overflow, oper_is_underflow=oper_is_underflow) # ----------------------------------------------------------------------------- # Tests helpers for adds - is overflow/underflow/neither overflow nor underflow def adds_is_overflow(typ, max_): return ''' int adds_is_overflow(const {typ} a, const {typ} b) {{ return (a 
> 0) && (b > {max_} - a); }} '''.format(typ=typ, max_=max_) def adds_signed_is_underflow(typ, min_): return ''' int adds_signed_is_underflow(const {typ} a, const {typ} b) {{ return (a < 0) && (b < {min_} - a); }} '''.format(typ=typ, min_=min_) def adds_signed_is_neither_overflow_nor_underflow(typ): return ''' int adds_signed_is_neither_overflow_nor_underflow(const {typ} a, const {typ} b) {{ return ! adds_is_overflow(a, b) && ! adds_signed_is_underflow(a, b); }} '''.format(typ=typ) # ----------------------------------------------------------------------------- # Tests helpers for adds with integer types # test integer overflow def test_adds_overflow(typ, max_): rand_ = '({typ})rand()'.format(typ=typ) \ if typ in common.utypes else 'rand()' return ''' int test_overflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ /* if ((vin1[ii] > 0) && (vin2[ii] > {max_} - vin1[ii])) {{ overflow }} */ int ii = 0; /* vin1[ii] > 0 */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})({rand_} % {max_}); vin1[ii] = (rand_val == 0 ? 1 : rand_val); }} /* vin2[ii] > {max_} - vin1[ii] vin2[ii] = {max_} - vin1[ii] + rand_val s.t.: 0 < rand_val <= vin1[ii] */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})({rand_} % (vin1[ii] + 1)); rand_val = (rand_val == 0 ? 
1 : rand_val); vin2[ii] = ({typ})({max_} - vin1[ii] + rand_val); vout_expected[ii] = {max_}; }} /* Test: if ((vin1[ii] > 0) && (vin2[ii] > {max_} - vin1[ii])) {{ vout_expected[ii] == {max_}; }} */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, max_=max_, rand_=rand_) # ----------------------------------------------------------------------------- # Tests helpers for adds with signed integer types # test signed underflow def test_adds_signed_underflow(typ, min_): return ''' int test_underflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ /* if ((vin1[ii] < 0) && (vin2[ii] < {min_} - vin1[ii])) {{ underflow }} */ int ii = 0; /* vin1[ii] < 0 */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})((- rand()) % {min_}); vin1[ii] = (rand_val == 0 ? - 1 : rand_val); }} /* vin1[ii] < 0 vin2[ii] < {min_} - vin1[ii] vin2[ii] = {min_} - vin1[ii] - rand_val s.t.: 0 < rand_val < - vin1[ii] */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})((rand()) % (- vin1[ii])); rand_val = (rand_val == 0 ? 
1 : rand_val); vin2[ii] = ({typ})({min_} - vin1[ii] - rand_val); vout_expected[ii] = {min_}; }} /* Test: if ((vin1[ii] < 0) && (vin2[ii] < {min_} - vin1[ii])) {{ vout_expected[ii] == {min_}; }} */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, min_=min_) # test signed neither overflow nor underflow def test_adds_signed_neither_overflow_nor_underflow(typ, min_, max_): return \ test_signed_neither_overflow_nor_underflow(typ, min_, max_, '+', 'adds_signed_is_neither_overflow_nor_underflow') # test signed all cases def test_adds_signed_all_cases(typ, min_, max_): return test_signed_all_cases(typ, min_, max_, '+', 'adds_is_overflow', 'adds_signed_is_underflow') # all signed tests def tests_adds_signed(): return''' zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_overflow(vin1, vin2, vout_expected, vout_computed), "overflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_underflow(vin1, vin2, vout_expected, vout_computed), "underflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_neither_overflow_nor_underflow(vin1, vin2, vout_expected, vout_computed), "neither underflow nor overflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_all_cases(vin1, vin2, vout_expected, vout_computed), "all cases"); ''' # ----------------------------------------------------------------------------- # Tests helper for adds with unsigned types # test signed neither overflow nor underflow def test_adds_unsigned_no_overflow(typ, max_): return ''' int test_no_overflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ int ii = 0; while(ii < SIZE) {{ {typ} a = ({typ})(({typ})rand() % {max_}); {typ} b = ({typ})(({typ})rand() % {max_}); if(! 
adds_is_overflow(a, b)) {{ vin1[ii] = a; vin2[ii] = b; vout_expected[ii] = ({typ})(a + b); ++ ii; }} }} assert(ii == SIZE); /* Test: if (not adds is overflow) {{ vout_expected[ii] == a + b; }} */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, max_=max_) # test unsigned all cases def test_adds_unsigned_all_cases(typ, max_): return ''' int test_all_cases({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ int ii = 0; for(ii = 0; ii < SIZE; ++ii) {{ vin1[ii] = ({typ})(({typ})rand() % {max_}); vin2[ii] = ({typ})(({typ})rand() % {max_}); if(adds_is_overflow(vin1[ii], vin2[ii])) {{ vout_expected[ii] = {max_}; }} else {{ vout_expected[ii] = ({typ})(vin1[ii] + vin2[ii]); }} }} /* Test all cases: */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, max_=max_) # all unsigned tests def tests_adds_unsigned(): return''' zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_overflow(vin1, vin2, vout_expected, vout_computed), "overflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_no_overflow(vin1, vin2, vout_expected, vout_computed), "no overflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_all_cases(vin1, vin2, vout_expected, vout_computed), "all cases"); ''' # ------------------------------------------------------------------------------ # Get adds tests given type def get_adds_tests_cases_for_signed_types(typ, min_, max_): helpers = ''' {test_adds_overflow} {test_adds_signed_underflow} {adds_is_overflow} {adds_signed_is_underflow} {adds_signed_is_neither_overflow_nor_underflow} {test_adds_signed_neither_overflow_nor_underflow} {test_adds_signed_all_cases} ''' .format(test_adds_overflow=test_adds_overflow(typ, max_), test_adds_signed_underflow=test_adds_signed_underflow( typ, min_), adds_is_overflow=adds_is_overflow(typ, max_), 
adds_signed_is_underflow=adds_signed_is_underflow( typ, min_), adds_signed_is_neither_overflow_nor_underflow=adds_signed_is_neither_overflow_nor_underflow( typ), test_adds_signed_neither_overflow_nor_underflow=test_adds_signed_neither_overflow_nor_underflow( typ, min_=min_, max_=max_), test_adds_signed_all_cases=test_adds_signed_all_cases( typ, min_=min_, max_=max_) ) return {'helpers': helpers, 'tests': tests_adds_signed()} def get_adds_tests_cases_for_unsigned_types(typ, max_): helpers = ''' {test_adds_overflow} {adds_is_overflow} {test_adds_unsigned_no_overflow} {test_adds_unsigned_all_cases} ''' .format(test_adds_overflow=test_adds_overflow(typ, max_), adds_is_overflow=adds_is_overflow(typ, max_), test_adds_unsigned_no_overflow=test_adds_unsigned_no_overflow( typ, max_), test_adds_unsigned_all_cases=test_adds_unsigned_all_cases(typ, max_) ) return {'helpers': helpers, 'tests': tests_adds_unsigned()} def get_adds_tests_cases_given_type(typ): if typ in common.iutypes: type_limits = common.limits[typ] min_ = type_limits['min'] max_ = type_limits['max'] if typ in common.itypes: return get_adds_tests_cases_for_signed_types(typ=typ, min_=min_, max_=max_) if typ in common.utypes: return get_adds_tests_cases_for_unsigned_types(typ=typ, max_=max_) else: msg = '{typ} not implemented'.format(typ=typ) raise TypeError(msg) # ----------------------------------------------------------------------------- # gen_adds def gen_adds(opts, op, typ, lang): # Do not test for floats since adds(floats) == add(floats) if typ in common.ftypes: return filename = get_filename(opts, op, typ, lang) if filename == None: return sizeof = common.sizeof(typ) head = ''' {includes} #include #define SIZE (2048 / {sizeof}) #define STATUS "test of {op_name} over {typ}" {aligned_alloc_error} {adds_subs_check_case} ''' .format(includes=get_includes(lang), op_name=op.name, typ=typ, sizeof=sizeof, aligned_alloc_error=aligned_alloc_error(), adds_subs_check_case=adds_subs_check_case()) with 
common.open_utf8(opts, filename) as out: out.write( ''' \ {head} /* ------------------------------------------------------------------------- */ {random_sign_flip} {zero_out_arrays} {equal} {compare_expected_vs_computed} {tests_helpers} int main(void) {{ const int mem_aligned_size = SIZE * {sizeof}; {typ} *vin1; {typ} *vin2; {typ} *vout_expected; {typ} *vout_computed; CHECK(vin1 = ({typ} *)nsimd_aligned_alloc(mem_aligned_size)); CHECK(vin2 = ({typ} *)nsimd_aligned_alloc(mem_aligned_size)); CHECK(vout_expected = ({typ} *)nsimd_aligned_alloc(mem_aligned_size)); CHECK(vout_computed = ({typ} *)nsimd_aligned_alloc(mem_aligned_size)); {tests} fprintf(stdout, STATUS "... OK\\n"); fflush(stdout); return EXIT_SUCCESS; }} ''' .format(head=head, compare_expected_vs_computed=\ compare_expected_vs_computed(typ, op.name, lang), random_sign_flip='' if typ in common.utypes \ else random_sign_flip(), zero_out_arrays=zero_out_arrays(typ), equal=equal(typ), tests_helpers=\ get_adds_tests_cases_given_type(typ)['helpers'], tests=get_adds_tests_cases_given_type(typ)['tests'], op_name = op.name, typ=typ, sizeof = sizeof) ) common.clang_format(opts, filename) # ----------------------------------------------------------------------------- # Tests helpers for subs - is overflow/underflow/neither overflow nor underflow # subs signed def subs_signed_is_overflow(typ, max_): return ''' int subs_signed_is_overflow(const {typ} a, const {typ} b) {{ return (b < 0) && (a > {max_} + b); }} '''.format(typ=typ, max_=max_) def subs_signed_is_underflow(typ, min_): return ''' int subs_signed_is_underflow(const {typ} a, const {typ} b) {{ return (b > 0) && (a < {min_} + b); }} '''.format(typ=typ, min_=min_) def subs_signed_is_neither_overflow_nor_underflow(typ): return ''' int subs_signed_is_neither_overflow_nor_underflow(const {typ} a, const {typ} b) {{ return !subs_signed_is_overflow(a, b) && !subs_signed_is_underflow(a, b); }} '''.format(typ=typ) # subs unsigned def subs_unsigned_is_underflow(typ): 
return ''' int subs_unsigned_is_underflow(const {typ} a, const {typ} b) {{ return a < b; }} '''.format(typ=typ) # ----------------------------------------------------------------------------- # Tests helpers for subs with signed types # test signed integer overflow def test_subs_signed_overflow(typ, min_, max_): return ''' int test_overflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ /* if ((vin2[ii] < 0) && (vin1[ii] > {max_} + vin2[ii])) {{ overflow }} */ int ii = 0; /* vin2[ii] < 0 */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})((- rand()) % {min_}); vin2[ii] = (rand_val == 0 ? - 1 : rand_val); }} /* vin1[ii] - vin2[ii] > {max_} vin1[ii] > {max_} + vin2[ii] vin1[ii] = {max_} + vin2[ii] + rand_val s.t.: 0 < rand_val <= - vin2[ii] (- TYPE_MIN) overflows if vin2[ii] == -1 --> rand() % -(vin2[ii] + 1) --> rand() % 0 Therefore check if vin2[ii] == -1 --> if True --> set rand_val == 1 */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = 0; if(-1 == vin2[ii]){{ rand_val = 1; }} else{{ rand_val = ({typ})(rand() % -(vin2[ii] + 1)); rand_val = (rand_val == 0 ? 1 : rand_val); }} vin1[ii] = ({typ})({max_} + vin2[ii] + rand_val); vout_expected[ii] = {max_}; }} /* Test: if ((vin2[ii] < 0) && (vin1[ii] > {max_} + vin2[ii])) {{ vout_expected[ii] == {max_}; }} */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, min_=min_, max_=max_) # test signed underflow def test_subs_signed_underflow(typ, min_, max_): return ''' int test_underflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ /* if ((vin2[ii] > 0) && (vin1[ii] < {min_} + vin2[ii])) {{ underflow }} */ int ii = 0; /* vin2[ii] > 0 */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})(rand() % {max_}); vin2[ii] = (rand_val == 0 ? 
1 : rand_val); }} /* vin1[ii] < {min_} + vin2[ii] vin1[ii] = {min_} + vin2[ii] - rand_val s.t.: 0 < rand_val < vin2[ii] */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})(rand() % vin2[ii]); rand_val = (rand_val == 0 ? 1 : rand_val); vin1[ii] = ({typ})({min_} + vin2[ii] - rand_val); vout_expected[ii] = {min_}; }} /* Test: if ((vin2[ii] > 0) && (vin1[ii] < {min_} + vin2[ii])) {{ vout_expected[ii] == {min_}; }} */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, min_=min_, max_=max_) # test signed neither overflow nor underflow def test_subs_signed_neither_overflow_nor_underflow(typ, min_, max_): return \ test_signed_neither_overflow_nor_underflow(typ, min_, max_, '-', 'subs_signed_is_neither_overflow_nor_underflow') # test signed all cases def test_subs_signed_all_cases(typ, min_, max_): return test_signed_all_cases(typ, min_, max_, '-', 'subs_signed_is_overflow', 'subs_signed_is_underflow') # all signed tests def tests_subs_signed(): return ''' zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_overflow(vin1, vin2, vout_expected, vout_computed), "overflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_underflow(vin1, vin2, vout_expected, vout_computed), "underflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_neither_overflow_nor_underflow(vin1, vin2, vout_expected, vout_computed), "neither underflow nor overflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_all_cases(vin1, vin2, vout_expected, vout_computed), "all cases"); ''' # ----------------------------------------------------------------------------- # Tests helpers for subs with unsigned types # test unsigned underflow def test_subs_unsigned_underflow(typ, min_, max_): return ''' int test_underflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ /* if (vin1[ii] < vin2[ii]) {{ underflow }} */ int ii = 
0; /* vin1[ii] */ for(ii = 0; ii < SIZE; ++ii) {{ vin1[ii] = ({typ})(({typ})rand() % {max_}); }} /* vin1[ii] < vin2[ii] vin2[ii] = vin1[ii] + rand_val s.t.: 0 < rand_val < {max_} - vin1[ii] */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})(({typ})rand() % ({max_} - vin1[ii])); rand_val = (rand_val == 0 ? 1 : rand_val); vin2[ii] = ({typ})(vin1[ii] + rand_val); vout_expected[ii] = ({typ}){min_}; }} /* Test: if (vin1[ii] < vin2[ii]) {{ vout_expected[ii] == {min_}; }} */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, min_=min_, max_=max_) # test unsigned no underflow def test_subs_unsigned_no_underflow(typ, max_): return ''' int test_no_underflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ /* if (vin1[ii] >= vin2[ii]) {{ no underflow }} */ int ii = 0; /* vin1[ii] */ for(ii = 0; ii < SIZE; ++ii) {{ vin1[ii] = ({typ})(({typ})rand() % {max_}); }} /* vin1[ii] >= vin2[ii] vin2 = vin1 - rand_val s.t. 
0 <= rand_val <= vin1 */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})(({typ})rand() % (vin1[ii] + 1)); vin2[ii] = ({typ})(vin1[ii] - rand_val); vout_expected[ii] = ({typ})(vin1[ii] - vin2[ii]); }} /* Test: if (vin1[ii] >= vin2[ii]) {{ vout_expected[ii] == vin1[ii] - vin2[ii]; }} */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, max_=max_)

# test unsigned all cases
def test_subs_unsigned_all_cases(typ, min_, max_):
    # Returns the C source of a helper that exercises saturated subtraction
    # (subs) on random unsigned inputs of type `typ`; whenever the reference
    # computation would underflow the expected value saturates to `min_`.
    return ''' int test_all_cases({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ int ii = 0; for(ii = 0; ii < SIZE; ++ii) {{ vin1[ii] = ({typ})(({typ})rand() % {max_}); vin2[ii] = ({typ})(({typ})rand() % {max_}); if(subs_unsigned_is_underflow(vin1[ii], vin2[ii])) {{ vout_expected[ii] = ({typ}){min_}; }} else {{ vout_expected[ii] = ({typ})(vin1[ii] - vin2[ii]); }} }} /* Test all cases: */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, min_=min_, max_=max_)

# all unsigned tests
def tests_subs_unsigned():
    # C statements run from main(): each case zeroes the work buffers and
    # CHECK_CASEs one of the helpers emitted above.
    return ''' zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_underflow(vin1, vin2, vout_expected, vout_computed), "underflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_no_underflow(vin1, vin2, vout_expected, vout_computed), "no underflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_all_cases(vin1, vin2, vout_expected, vout_computed), "all cases"); '''

# ------------------------------------------------------------------------------
# Get subs tests given type

def get_subs_tests_cases_for_signed_types(typ, min_, max_):
    # Bundle all signed-subs C helpers plus the main()-body test statements.
    # Returns a dict with keys 'helpers' (C function definitions) and
    # 'tests' (C statements calling them).
    helpers = ''' {test_subs_signed_overflow} {test_subs_signed_underflow} {subs_signed_is_overflow} {subs_signed_is_underflow} {subs_signed_is_neither_overflow_nor_underflow} {test_subs_signed_neither_overflow_nor_underflow} {test_subs_signed_all_cases} '''.format(
        test_subs_signed_overflow=\
            test_subs_signed_overflow(typ, min_, max_),
        test_subs_signed_underflow=\
            test_subs_signed_underflow(typ, min_, max_),
        subs_signed_is_overflow=\
            subs_signed_is_overflow(typ, max_),
        subs_signed_is_underflow=\
            subs_signed_is_underflow(typ, min_),
        subs_signed_is_neither_overflow_nor_underflow=\
            subs_signed_is_neither_overflow_nor_underflow(typ),
        test_subs_signed_neither_overflow_nor_underflow=\
            test_subs_signed_neither_overflow_nor_underflow(
                typ, min_=min_, max_=max_),
        test_subs_signed_all_cases=\
            test_subs_signed_all_cases(typ, min_=min_, max_=max_))
    return {'helpers': helpers, 'tests': tests_subs_signed()}

def get_subs_tests_cases_for_unsigned_types(typ, min_, max_):
    # Unsigned counterpart of get_subs_tests_cases_for_signed_types:
    # unsigned subs can only underflow, so fewer helpers are needed.
    helpers = ''' {test_subs_unsigned_underflow} {test_subs_unsigned_no_underflow} {subs_unsigned_is_underflow} {test_subs_unsigned_all_cases} '''.format(
        test_subs_unsigned_underflow=\
            test_subs_unsigned_underflow(typ, min_, max_),
        test_subs_unsigned_no_underflow=\
            test_subs_unsigned_no_underflow(typ, max_),
        subs_unsigned_is_underflow=\
            subs_unsigned_is_underflow(typ),
        test_subs_unsigned_all_cases=\
            test_subs_unsigned_all_cases(typ, min_, max_))
    return {'helpers': helpers, 'tests': tests_subs_unsigned()}

def get_subs_tests_cases_given_type(typ):
    # Dispatch on the scalar type: signed vs unsigned integers get different
    # helper sets; any other type (floats are filtered out by the caller)
    # raises TypeError.
    if typ in common.iutypes:
        type_limits = common.limits[typ]
        min_ = type_limits['min']
        max_ = type_limits['max']
        if typ in common.itypes:
            return get_subs_tests_cases_for_signed_types(
                typ=typ, min_=min_, max_=max_)
        if typ in common.utypes:
            return get_subs_tests_cases_for_unsigned_types(
                typ=typ, min_=min_, max_=max_)
    else:
        msg = '{typ} not implemented'.format(typ=typ)
        raise TypeError(msg)

# -----------------------------------------------------------------------------
# gen_subs

def gen_subs(opts, op, typ, lang):
    # Emit a standalone C/C++ test program for the `subs` (saturated
    # subtraction) operator over `typ` in API flavor `lang`, then
    # clang-format it.
    # Do not test for floats since subs(floats) == sub(floats)
    if typ in common.ftypes:
        return
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    sizeof = common.sizeof(typ)
    # NOTE(review): the '#include <...>' header name below was lost during
    # text extraction (angle-bracket content stripped) — restore it from the
    # upstream egg/gen_tests.py before using this file.
    head = \
    '''{includes} #include #define SIZE (2048 / {sizeof})
#define STATUS "test of {op_name} over {typ}" {aligned_alloc_error} {adds_subs_check_case}'''. \
    format(includes=get_includes(lang), op_name=op.name, typ=typ,
           sizeof=sizeof, aligned_alloc_error=aligned_alloc_error(),
           adds_subs_check_case=adds_subs_check_case())
    with common.open_utf8(opts, filename) as out:
        # Full test program: four aligned buffers, the type-specific helper
        # functions, then the CHECK_CASE statements.
        out.write(''' {head} {hbar} {random_sign_flip} {zero_out_arrays} {equal} {compare_expected_vs_computed} {tests_helpers} int main(void) {{ const int mem_aligned_size = SIZE * {sizeof}; {typ} *vin1; {typ} *vin2; {typ} *vout_expected; {typ} *vout_computed; CHECK(vin1 = ({typ} *)nsimd_aligned_alloc(mem_aligned_size)); CHECK(vin2 = ({typ} *)nsimd_aligned_alloc(mem_aligned_size)); CHECK(vout_expected = ({typ} *)nsimd_aligned_alloc(mem_aligned_size)); CHECK(vout_computed = ({typ} *)nsimd_aligned_alloc(mem_aligned_size)); {tests} fprintf(stdout, STATUS "... OK\\n"); fflush(stdout); return EXIT_SUCCESS; }} '''.format(head=head,
                   compare_expected_vs_computed=\
                       compare_expected_vs_computed(typ, op.name, lang),
                   # sign flipping only makes sense for signed types
                   random_sign_flip='' if typ in common.utypes \
                                       else random_sign_flip(),
                   zero_out_arrays=zero_out_arrays(typ), equal=equal(typ),
                   tests_helpers=\
                       get_subs_tests_cases_given_type(typ)['helpers'],
                   tests=get_subs_tests_cases_given_type(typ)['tests'],
                   op_name=op.name, typ=typ, hbar=common.hbar,
                   sizeof=sizeof))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Tests for all and any

def gen_all_any(opts, op, typ, lang):
    # Emit a test for the `all`/`any` reductions: a logical vector with all
    # lanes true, all lanes false, and exactly one lane true must reduce to
    # the expected boolean (`any` inverts the single-lane expectation).
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    if lang == 'c_base':
        op_test = 'v{}(vloadla(buf, {}), {})'.format(op.name, typ, typ)
    elif lang == 'c_adv':
        op_test = 'nsimd_{}(nsimd_loadla(nsimd_packl_{}, buf))'. \
                  format(op.name, typ)
    elif lang == 'cxx_base':
        op_test = 'nsimd::{}(nsimd::loadla(buf, {}()), {}())'. \
                  format(op.name, typ, typ)
    else:
        # NOTE(review): the template argument of nsimd::loadla (presumably
        # nsimd::packl<{typ}>) was lost in extraction — restore from
        # upstream; the second format() argument is currently unused here.
        op_test = 'nsimd::{}(nsimd::loadla >(buf))'. \
                  format(op.name, typ)
    if typ == 'f16':
        scalar0 = 'nsimd_f32_to_f16(0)'
        scalar1 = 'nsimd_f32_to_f16(1)'
    else:
        scalar0 = '({})0'.format(typ)
        scalar1 = '({})1'.format(typ)
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} int main(void) {{ int i; {typ} *buf; int len = vlen({typ}); fprintf(stdout, "test of {op_name} over {typ}...\\n"); CHECK(buf = ({typ}*)nsimd_aligned_alloc(len * {sizeof})); /* Test with all elements to true */ for (i = 0; i < len; i++) {{ buf[i] = {scalar1}; }} if (!{op_test}) {{ exit(EXIT_FAILURE); }} /* Test with all elements set to false */ for (i = 0; i < len; i++) {{ buf[i] = {scalar0}; }} if ({op_test}) {{ exit(EXIT_FAILURE); }} /* Test with only one element set to true */ if (len > 1) {{ buf[0] = {scalar1}; if ({notl}{op_test}) {{ exit(EXIT_FAILURE); }} }} fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); return EXIT_SUCCESS; }}'''.format(includes=get_includes(lang), op_name=op.name,
                        typ=typ, op_test=op_test, year=date.today().year,
                        # `any` must be true with one lane set, `all` false
                        notl='!' if op.name == 'any' else '',
                        scalar0=scalar0, scalar1=scalar1,
                        sizeof=common.sizeof(typ)))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Tests for load/store of degrees 2, 3 and 4

def gen_load_store(opts, op, typ, lang):
    # Emit a round-trip test for loadN{a,u}/storeN{a,u} (N = 2, 3, 4):
    # random data is loaded deinterleaved and stored back; the output buffer
    # must equal the input buffer.
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    # degree and alignment letter are encoded in the operator name,
    # e.g. 'load3u' -> deg '3', align 'u'
    if op.name.startswith('load'):
        deg = op.name[4]
        align = op.name[5]
    elif op.name.startswith('store'):
        deg = op.name[5]
        align = op.name[6]
    variables = ', '.join(['v.v{}'.format(i) for i in range(0, int(deg))])
    if lang == 'c_base':
        load_store = \
            '''vecx{deg}({typ}) v = vload{deg}{align}(&vin[i], {typ}); vstore{deg}{align}(&vout[i], {variables}, {typ});'''. \
            format(deg=deg, typ=typ, align=align, variables=variables)
    elif lang == 'c_adv':
        load_store = \
            '''nsimd_packx{deg}_{typ} v = nsimd_load{deg}{align}(nsimd_packx{deg}_{typ}, &vin[i]); nsimd_store{deg}{align}(&vout[i], {variables});'''. \
            format(deg=deg, typ=typ, align=align, variables=variables)
    elif lang == 'cxx_base':
        load_store = \
            '''vecx{deg}({typ}) v = nsimd::load{deg}{align}(&vin[i], {typ}()); nsimd::store{deg}{align}(&vout[i], {variables}, {typ}());'''. \
            format(deg=deg, typ=typ, align=align, variables=variables)
    else:
        load_store = \
            '''nsimd::packx{deg}<{typ}> v = nsimd::load{deg}{align}< nsimd::packx{deg}<{typ}> >(&vin[i]); nsimd::store{deg}{align}(&vout[i], {variables});'''. \
            format(deg=deg, typ=typ, align=align, variables=variables)
    if typ == 'f16':
        # f16 values are filled/compared through their u16 bit patterns
        rand = '*((u16*)vin + i) = nsimd_f32_to_u16((float)(rand() % 10));'
        comp = '*((u16*)vin + i) != *((u16 *)vout + i)'
    else:
        rand = 'vin[i] = ({})(rand() % 10);'.format(typ)
        comp = 'vin[i] != vout[i]'
    # unaligned variants offset the pointer by one element to force
    # misalignment
    if align=='u':
        unalign = '+1'
    else:
        unalign = ''
    # NOTE(review): in the generated allocation below, `{unalign}` adds 1 to
    # the BYTE count inside nsimd_aligned_alloc(...) but 1 ELEMENT to the
    # resulting pointer, so for types wider than one byte the buffer is
    # over-read/written by sizeof(typ)-1 bytes — confirm and fix upstream.
    with common.open_utf8(opts, filename) as out:
        out.write('''{includes} #define SIZE (2048 / {sizeof}) #define STATUS "test of {op_name} over {typ}" #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} int main(void) {{ int i, vi; {typ} *vin, *vout; int len = vlen({typ}); int n = SIZE * {deg} * len; fprintf(stdout, "test of {op_name} over {typ}...\\n"); CHECK(vin = ({typ}*)nsimd_aligned_alloc( n * {sizeof} {unalign}) {unalign}); CHECK(vout = ({typ}*)nsimd_aligned_alloc( n * {sizeof} {unalign}) {unalign}); /* Fill with random data */ for (i = 0; i < n; i++) {{ {rand} }} /* Load and put back data into vout */ for (i = 0; i < n; i += {deg} * len) {{ {load_store} }} /* Compare results */ for (vi = 0; vi < SIZE; vi += len) {{ for (i = vi; i < vi + len; i++) {{ if ({comp}) {{ fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return -1; }} }} }} fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); return EXIT_SUCCESS; }}'''.format(includes=get_includes(lang), op_name=op.name,
                          typ=typ, rand=rand, year=date.today().year,
                          deg=deg, sizeof=common.sizeof(typ),
                          load_store=load_store, comp=comp,
                          unalign=unalign))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Tests for gather/scatter

def gen_gather_scatter(opts, op, typ, lang):
    # Emit a test for gather/scatter (and the *_linear variants): vin/vout
    # hold 0 1 0 1 ...; odd-offset elements of vin are gathered and
    # scattered to even offsets of vout, so afterwards vout must be all 0.
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    # offsets are computed in the same-width signed integer type
    ityp = 'i' + typ[1:]
    if lang == 'c_base':
        if op.name == 'gather_linear':
            gather_scatter = '''vscatter_linear(vout + 1, 2, vgather_linear( vin, 2, {typ}), {typ});'''.format(typ=typ)
        else:
            gather_scatter = \
                '''vec({ityp}) offsets = vmul(viota({ityp}), vset1(({ityp})2, {ityp}), {ityp}); vec({typ}) v = vgather(vin, offsets, {typ}); offsets = vadd(offsets, vset1(({ityp})1, {ityp}), {ityp}); vscatter(vout, offsets, v, {typ});'''. \
                format(typ=typ, ityp=ityp)
    elif lang == 'c_adv':
        if op.name == 'gather_linear':
            gather_scatter = \
                '''nsimd_scatter_linear( vout + 1, 2, nsimd_gather_linear( nsimd_pack_{}, vin, 2));'''.format(typ)
        else:
            gather_scatter = \
                '''nsimd_pack_{ityp} offsets = nsimd_mul(nsimd_iota( nsimd_pack_{ityp}), nsimd_set1( nsimd_pack_{ityp}, ({ityp})2)); nsimd_pack_{typ} v = nsimd_gather( nsimd_pack_{typ}, vin, offsets); offsets = nsimd_add(offsets, nsimd_set1(nsimd_pack_{ityp}, ({ityp})1)); nsimd_scatter(vout, offsets, v);'''. \
                format(typ=typ, ityp=ityp)
    elif lang == 'cxx_base':
        if op.name == 'gather_linear':
            gather_scatter = '''nsimd::scatter_linear(vout + 1, 2, nsimd::gather_linear( vin, 2, {typ}()), {typ}());'''. \
                format(typ=typ)
        else:
            gather_scatter = \
                '''vec({ityp}) offsets = nsimd::mul(nsimd::iota({ityp}()), nsimd::set1(({ityp})2, {ityp}()), {ityp}()); vec({typ}) v = nsimd::gather(vin, offsets, {typ}()); offsets = nsimd::add(offsets, nsimd::set1(({ityp})1, {ityp}()), {ityp}()); nsimd::scatter(vout, offsets, v, {typ}());'''. \
                format(typ=typ, ityp=ityp)
    else:
        if op.name == 'gather_linear':
            # NOTE(review): the template argument of nsimd::gather_linear
            # (presumably nsimd::pack<{typ}>) was lost in extraction —
            # restore from upstream.
            gather_scatter = '''nsimd::scatter_linear(vout + 1, 2, nsimd::gather_linear >( vin, 2));'''.format(typ=typ)
        else:
            gather_scatter = \
                '''typedef nsimd::pack<{typ}> pack; typedef nsimd::pack<{ityp}> ipack; ipack offsets = nsimd::mul(nsimd::iota(), nsimd::set1(({ityp})2)); pack v = nsimd::gather(vin, offsets); offsets = nsimd::add(offsets, nsimd::set1(({ityp})1)); nsimd::scatter(vout, offsets, v);'''. \
                format(typ=typ, ityp=ityp)
    if typ == 'f16':
        one = 'nsimd_f32_to_f16(1.0f)'
        zero = 'nsimd_f32_to_f16(0.0f)'
        comp = 'nsimd_f16_to_f32(vout[i]) != 0.0f'
    else:
        one = '({typ})1'.format(typ=typ)
        zero = '({typ})0'.format(typ=typ)
        comp = 'vout[i] != ({typ})0'.format(typ=typ)
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} #define STATUS "test of {op_name} over {typ}" int main(void) {{ int n = 2 * vlen({typ}); int i; {typ} vin[2 * NSIMD_MAX_LEN({typ})]; {typ} vout[2 * NSIMD_MAX_LEN({typ})]; fprintf(stdout, "test of {op_name} over {typ}...\\n"); /* Fill input and output with 0 1 0 1 0 1 ... */ for (i = 0; i < n; i++) {{ if ((i % 2) == 1) {{ vin[i] = {one}; vout[i] = {one}; }} else {{ vin[i] = {zero}; vout[i] = {zero}; }} }} /* We gather odd offsets elements from vin and put then at even */ /* offsets. */ {{ {gather_scatter} }} /* Compare results */ for (i = 0; i < n; i++) {{ if ({comp}) {{ fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return -1; }} }} fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); return EXIT_SUCCESS; }}'''.format(includes=get_includes(lang), ityp=ityp, comp=comp,
                        typ=typ, year=date.today().year, op_name=op.name,
                        gather_scatter=gather_scatter, zero=zero, one=one))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Tests for masked scatter

def gen_mask_scatter(opts, op, typ, lang):
    # Emit a test for mask_scatter: for every tail length i, scatter 2's at
    # even offsets under a loop-tail mask; the first i even slots must
    # become 2, the rest stay 0, odd slots stay 1.
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    ityp = 'i' + typ[1:]
    if typ == 'f16':
        two = 'nsimd_f32_to_f16(2.0f)'
        one = 'nsimd_f32_to_f16(1.0f)'
        zero = 'nsimd_f32_to_f16(0.0f)'
        comp_with_0 = 'nsimd_f16_to_f32(vout[2 * k]) != 0.0f'
        comp_with_1 = 'nsimd_f16_to_f32(vout[2 * k + 1]) != 1.0f'
        comp_with_2 = 'nsimd_f16_to_f32(vout[2 * k]) != 2.0f'
    else:
        two = '({typ})2'.format(typ=typ)
        one = '({typ})1'.format(typ=typ)
        zero = '({typ})0'.format(typ=typ)
        comp_with_0 = 'vout[2 * k] != ({typ})0'.format(typ=typ)
        comp_with_1 = 'vout[2 * k + 1] != ({typ})1'.format(typ=typ)
        comp_with_2 = 'vout[2 * k] != ({typ})2'.format(typ=typ)
    # NOTE(review): the 'c_adv' branch below is a plain `if` while the
    # others chain with elif; harmless (branches are exclusive) but
    # inconsistent.
    if lang == 'c_base':
        mask_scatter = \
            '''vec({ityp}) offsets = vmul(viota({ityp}), vset1(({ityp})2, {ityp}), {ityp}); vecl({typ}) mask = vmask_for_loop_tail(0, i, {typ}); vmask_scatter(mask, vout, offsets, vset1({two}, {typ}), {typ});'''.format(two=two, typ=typ, ityp=ityp)
    if lang == 'c_adv':
        mask_scatter = \
            '''nsimd_pack_{ityp} offsets = nsimd_mul(nsimd_iota( nsimd_pack_{ityp}), nsimd_set1( nsimd_pack_{ityp}, ({ityp})2)); nsimd_packl_{typ} mask = nsimd_mask_for_loop_tail( nsimd_pack_{typ}, 0, i); nsimd_mask_scatter(mask, vout, offsets, nsimd_set1( nsimd_pack_{typ}, {two}));'''. \
            format(two=two, typ=typ, ityp=ityp)
    elif lang == 'cxx_base':
        mask_scatter = \
            '''vec({ityp}) offsets = nsimd::mul(nsimd::iota({ityp}()), nsimd::set1(({ityp})2, {ityp}()), {ityp}()); vecl({typ}) mask = nsimd::mask_for_loop_tail(0, i, {typ}()); nsimd::mask_scatter(mask, vout, offsets, nsimd::set1( {two}, {typ}()), {typ}());'''. \
            format(two=two, typ=typ, ityp=ityp)
    else:
        mask_scatter = \
            '''typedef nsimd::pack<{typ}> pack; typedef nsimd::pack<{ityp}> ipack; typedef nsimd::packl<{typ}> packl; ipack offsets = nsimd::mul(nsimd::iota(), nsimd::set1(({ityp})2)); packl mask = nsimd::mask_for_loop_tail(0, i); nsimd::mask_scatter(mask, vout, offsets, nsimd::set1({two}));'''. \
            format(two=two, typ=typ, ityp=ityp)
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} #define STATUS "test of {op_name} over {typ}" int main(void) {{ int n = 2 * vlen({typ}); int i, j, k; {typ} vout[2 * NSIMD_MAX_LEN({typ})]; fprintf(stdout, "test of {op_name} over {typ}...\\n"); for (i = 0; i < n / 2; i++) {{ /* Fill output with 0 1 0 1 0 1 ... */ for (j = 0; j < n; j++) {{ vout[j] = (j % 2 == 0 ? {zero} : {one}); }} {{ {mask_scatter} }} /* Check results */ for (k = 0; k < n / 2; k++) {{ if ({comp_with_1}) {{ goto error; }} }} for (k = 0; k < i; k++) {{ if ({comp_with_2}) {{ goto error; }} }} for (; k < n / 2; k++) {{ if ({comp_with_0}) {{ goto error; }} }} }} fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); fflush(stdout); return EXIT_SUCCESS; error: fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return EXIT_FAILURE; }}'''.format(includes=get_includes(lang), ityp=ityp, two=two,
                        typ=typ, year=date.today().year, op_name=op.name,
                        mask_scatter=mask_scatter, zero=zero, one=one,
                        comp_with_0=comp_with_0, comp_with_2=comp_with_2,
                        comp_with_1=comp_with_1))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Tests for masked gather

def gen_maskoz_gather(opts, op, typ, lang):
    # Emit a test for masko_gather / maskz_gather: masked-off lanes must
    # read 3 (the `other` vector, masko) or 0 (maskz); active lanes gather
    # the 1's sitting at even offsets of vin.
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    ityp = 'i' + typ[1:]
    if typ == 'f16':
        three = 'nsimd_f32_to_f16(3.0f)'
        two = 'nsimd_f32_to_f16(2.0f)'
        one = 'nsimd_f32_to_f16(1.0f)'
        zero = 'nsimd_f32_to_f16(0.0f)'
        comp_with_1 = 'nsimd_f16_to_f32(vout[k]) != 1.0f'
        if op.name == 'maskz_gather':
            comp_with_0_or_3 = 'nsimd_f16_to_f32(vout[k]) != 0.0f'
        else:
            comp_with_0_or_3 = 'nsimd_f16_to_f32(vout[k]) != 3.0f'
    else:
        three = '({typ})3'.format(typ=typ)
        two = '({typ})2'.format(typ=typ)
        one = '({typ})1'.format(typ=typ)
        zero = '({typ})0'.format(typ=typ)
        comp_with_1 = 'vout[k] != ({typ})1'.format(typ=typ)
        if op.name == 'maskz_gather':
            comp_with_0_or_3 = 'vout[k] != ({typ})0'.format(typ=typ)
        else:
            comp_with_0_or_3 = 'vout[k] != ({typ})3'.format(typ=typ)
    # suffix selecting the operator variant in the generated code
    oz = 'o' if op.name == 'masko_gather' else 'z'
    if lang == 'c_base':
        # `ta` is the trailing `other` argument, only present for masko
        ta = ', vset1({three}, {typ})'.format(three=three, typ=typ) \
             if op.name == 'masko_gather' else ''
        maskoz_gather = \
            '''vec({ityp}) offsets = vmul(viota({ityp}), vset1(({ityp})2, {ityp}), {ityp}); vecl({typ}) mask = vmask_for_loop_tail(0, i, {typ}); vstoreu(vout, vmask{oz}_gather(mask, vin, offsets{ta}, {typ}), {typ});'''. \
            format(typ=typ, ityp=ityp, ta=ta, oz=oz)
    if lang == 'c_adv':
        ta = ', nsimd_set1(nsimd_pack_{typ}, {three})'. \
             format(three=three, typ=typ) if op.name == 'masko_gather' else ''
        maskoz_gather = \
            '''nsimd_pack_{ityp} offsets = nsimd_mul(nsimd_iota( nsimd_pack_{ityp}), nsimd_set1( nsimd_pack_{ityp}, ({ityp})2)); nsimd_packl_{typ} mask = nsimd_mask_for_loop_tail( nsimd_pack_{typ}, 0, i); nsimd_storeu(vout, nsimd_mask{oz}_gather( mask, vin, offsets{ta}));'''. \
            format(typ=typ, ityp=ityp, ta=ta, oz=oz)
    elif lang == 'cxx_base':
        ta = ', nsimd::set1({three}, {typ}())'.format(three=three, typ=typ) \
             if op.name == 'masko_gather' else ''
        maskoz_gather = \
            '''vec({ityp}) offsets = nsimd::mul(nsimd::iota({ityp}()), nsimd::set1(({ityp})2, {ityp}()), {ityp}()); vecl({typ}) mask = nsimd::mask_for_loop_tail(0, i, {typ}()); nsimd::storeu(vout, nsimd::mask{oz}_gather( mask, vin, offsets{ta}, {typ}()), {typ}());'''. \
            format(typ=typ, ityp=ityp, ta=ta, oz=oz)
    else:
        # NOTE(review): the template argument of nsimd::set1 (presumably
        # nsimd::pack<{typ}>) was lost in extraction — restore from
        # upstream.
        ta = ', nsimd::set1 >({three})'. \
             format(three=three, typ=typ) if op.name == 'masko_gather' else ''
        maskoz_gather = \
            '''typedef nsimd::pack<{ityp}> ipack; typedef nsimd::packl<{typ}> packl; ipack offsets = nsimd::mul(nsimd::iota(), nsimd::set1(({ityp})2)); packl mask = nsimd::mask_for_loop_tail(0, i); nsimd::storeu(vout, nsimd::mask{oz}_gather( mask, vin, offsets{ta}));'''. \
            format(ta=ta, oz=oz, typ=typ, ityp=ityp)
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} #define STATUS "test of {op_name} over {typ}" int main(void) {{ int n = 2 * vlen({typ}); int i, j, k; {typ} vin[2 * NSIMD_MAX_LEN({typ})]; {typ} vout[NSIMD_MAX_LEN({typ})]; fprintf(stdout, "test of {op_name} over {typ}...\\n"); for (i = 0; i < n / 2; i++) {{ /* Fill input with 1 0 1 0 1 0 ... */ for (j = 0; j < n; j++) {{ vin[j] = (j % 2 == 1 ? {zero} : {one}); }} /* Fill output with 2's ... */ for (j = 0; j < n / 2; j++) {{ vout[j] = {two}; }} {{ {maskoz_gather} }} /* Check results */ for (k = 0; k < i; k++) {{ if ({comp_with_1}) {{ goto error; }} }} for (; k < n / 2; k++) {{ if ({comp_with_0_or_3}) {{ goto error; }} }} }} fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); fflush(stdout); return EXIT_SUCCESS; error: fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return EXIT_FAILURE; }}'''.format(includes=get_includes(lang), ityp=ityp, two=two,
                        typ=typ, year=date.today().year, op_name=op.name,
                        maskoz_gather=maskoz_gather, zero=zero, one=one,
                        comp_with_0_or_3=comp_with_0_or_3, three=three,
                        comp_with_1=comp_with_1))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Tests for masked loads

def gen_mask_load(opts, op, typ, lang):
    # Emit a test for masko_load{a,u}1 / maskz_load{a,u}1: active lanes must
    # read vin[j] == j; masked-off lanes must read -1 (masko's `other`) or
    # 0 (maskz).
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    if typ == 'f16':
        fill_vin = 'vin[i] = nsimd_f32_to_f16((f32)i);'
        m1 = 'nsimd_f32_to_f16(-1.0f)'
        comp1 = 'nsimd_f16_to_f32(vout[j]) != (f32)j'
    else:
        fill_vin = 'vin[i] = ({typ})i;'.format(typ=typ)
        m1 = '({typ})-1'.format(typ=typ)
        comp1 = 'vout[j] != ({typ})j'.format(typ=typ)
    if op.name in ['masko_loada1', 'masko_loadu1']:
        # masko variants take an `other` vector for masked-off lanes
        if lang == 'c_base':
            test = \
                '''vecl({typ}) mask = vmask_for_loop_tail(0, i, {typ}); vec({typ}) other = vset1({m1}, {typ}); vstoreu(vout, v{op_name}(mask, vin, other, {typ}), {typ});'''. \
                format(typ=typ, op_name=op.name, m1=m1)
        elif lang == 'c_adv':
            test = \
                '''nsimd_packl_{typ} mask = nsimd_mask_for_loop_tail( nsimd_packl_{typ}, 0, i); nsimd_pack_{typ} other = nsimd_set1(nsimd_pack_{typ}, {m1}); nsimd_storeu(vout, nsimd_{op_name}(mask, vin, other));'''. \
                format(typ=typ, op_name=op.name, m1=m1)
        elif lang == 'cxx_base':
            test = \
                '''vecl({typ}) mask = nsimd::mask_for_loop_tail(0, i, {typ}()); vec({typ}) other = nsimd::set1({m1}, {typ}()); nsimd::storeu(vout, nsimd::{op_name}( mask, vin, other, {typ}()), {typ}());'''. \
                format(typ=typ, op_name=op.name, m1=m1)
        elif lang == 'cxx_adv':
            # NOTE(review): template arguments of mask_for_loop_tail/set1
            # (presumably nsimd::packl<{typ}> / nsimd::pack<{typ}>) were
            # lost in extraction — restore from upstream.
            test = \
                '''nsimd::packl<{typ}> mask = nsimd::mask_for_loop_tail >(0, i); nsimd::pack<{typ}> other = nsimd::set1 >( {m1}); nsimd::storeu(vout, nsimd::{op_name}(mask, vin, other));'''. \
                format(typ=typ, op_name=op.name, m1=m1)
        comp2 = 'vout[j] != ({typ})-1'.format(typ=typ) if typ != 'f16' else \
                'nsimd_f16_to_f32(vout[j]) != -1.0f'
    else:
        # maskz variants zero masked-off lanes
        if lang == 'c_base':
            test = \
                '''vecl({typ}) mask = vmask_for_loop_tail(0, i, {typ}); vstoreu(vout, v{op_name}(mask, vin, {typ}), {typ});'''. \
                format(typ=typ, op_name=op.name, m1=m1)
        elif lang == 'c_adv':
            test = \
                '''nsimd_packl_{typ} mask = nsimd_mask_for_loop_tail( nsimd_packl_{typ}, 0, i); nsimd_storeu(vout, nsimd_{op_name}(mask, vin));'''. \
                format(typ=typ, op_name=op.name, m1=m1)
        elif lang == 'cxx_base':
            test = \
                '''vecl({typ}) mask = nsimd::mask_for_loop_tail(0, i, {typ}()); nsimd::storeu(vout, nsimd::{op_name}( mask, vin, {typ}()), {typ}());'''. \
                format(typ=typ, op_name=op.name, m1=m1)
        elif lang == 'cxx_adv':
            test = \
                '''nsimd::packl<{typ}> mask = nsimd::mask_for_loop_tail >(0, i); nsimd::storeu(vout, nsimd::{op_name}(mask, vin));'''. \
                format(typ=typ, op_name=op.name, m1=m1)
        # comparing against -0.0f is equivalent to 0.0f in C
        comp2 = 'vout[j] != ({typ})0'.format(typ=typ) if typ != 'f16' else \
                'nsimd_f16_to_f32(vout[j]) != -0.0f'
    if op.name in ['masko_loadu1', 'maskz_loadu1']:
        unalign = '\nvin += 1;'
    else:
        unalign = ''
    # NOTE(review): the generated allocation below requests `2 * len` BYTES
    # but the test reads `len` elements of {typ} (plus the +1 element
    # unalign offset) — likely should be `2 * len * sizeof(typ)`; confirm
    # against upstream.
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} #define STATUS "test of {op_name} over {typ}" #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} int main(void) {{ int i, j; {typ} *vin; {typ} vout[NSIMD_MAX_LEN({typ})]; int len = vlen({typ}); fprintf(stdout, "test of {op_name} over {typ}...\\n"); CHECK(vin = ({typ}*)nsimd_aligned_alloc(2 * len));{unalign} /* Fill with data */ for (i = 0; i < len; i++) {{ {fill_vin} }} /* Load and put back data into vout */ for (i = 0; i < len; i++) {{ {test} for (j = 0; j < i; j++) {{ if ({comp1}) {{ fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return -1; }} }} for (; j < len; j++) {{ if ({comp2}) {{ fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return -1; }} }} }} fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); return EXIT_SUCCESS; }}'''.format(includes=get_includes(lang), op_name=op.name,
                        typ=typ, year=date.today().year, test=test,
                        comp1=comp1, comp2=comp2, unalign=unalign,
                        fill_vin=fill_vin))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Tests for masked stores

def gen_mask_store(opts, op, typ, lang):
    # Emit a test for mask_store{a,u}1: storing 1's under a loop-tail mask
    # into a zeroed buffer must set exactly the first i lanes.
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    if typ == 'f16':
        fill_vout = 'vout[i] = nsimd_f32_to_f16((f32)0);'
        one = 'nsimd_f32_to_f16(1.0f)'
        comp1 = 'nsimd_f16_to_f32(vout[j]) != (f32)1'
        comp2 = 'nsimd_f16_to_f32(vout[j]) != (f32)0'
    else:
        fill_vout = 'vout[i] = ({typ})0;'.format(typ=typ)
        one = '({typ})1'.format(typ=typ)
        comp1 = 'vout[j] != ({typ})1'.format(typ=typ)
        comp2 = 'vout[j] != ({typ})0'.format(typ=typ)
    if lang == 'c_base':
        test = \
            '''vecl({typ}) mask = vmask_for_loop_tail(0, i, {typ}); v{op_name}(mask, vout, vset1({one}, {typ}), {typ});'''. \
            format(typ=typ, op_name=op.name, one=one)
    elif lang == 'c_adv':
        test = \
            '''nsimd_packl_{typ} mask = nsimd_mask_for_loop_tail( nsimd_packl_{typ}, 0, i); nsimd_{op_name}(mask, vout, nsimd_set1( nsimd_pack_{typ}, {one}));'''. \
            format(typ=typ, op_name=op.name, one=one)
    elif lang == 'cxx_base':
        test = \
            '''vecl({typ}) mask = nsimd::mask_for_loop_tail(0, i, {typ}()); nsimd::{op_name}(mask, vout, nsimd::set1({one}, {typ}()), {typ}());'''.format(typ=typ, op_name=op.name, one=one)
    elif lang == 'cxx_adv':
        # NOTE(review): template arguments of mask_for_loop_tail/set1 were
        # lost in extraction — restore from upstream.
        test = \
            '''nsimd::packl<{typ}> mask = nsimd::mask_for_loop_tail >(0, i); nsimd::{op_name}(mask, vout, nsimd::set1 >({one}));'''. \
            format(typ=typ, op_name=op.name, one=one)
    if op.name == 'mask_storeu1':
        unalign = '\nvout += 1;'
    else:
        unalign = ''
    # NOTE(review): the generated C comment "Fill vout with zeors" has a
    # typo ("zeros") — it is part of the emitted string, left untouched
    # here.
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} #define STATUS "test of {op_name} over {typ}" #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} int main(void) {{ int i, j; {typ} *vout; int len = vlen({typ}); fprintf(stdout, "test of {op_name} over {typ}...\\n"); CHECK(vout = ({typ}*)nsimd_aligned_alloc({sizeof} * len));{unalign} /* Fill vout with zeors */ for (i = 0; i < len; i++) {{ {fill_vout} }} /* Store data into vout */ for (i = 0; i < len; i++) {{ {test} for (j = 0; j < i; j++) {{ if ({comp1}) {{ fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return -1; }} }} for (; j < len; j++) {{ if ({comp2}) {{ fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return -1; }} }} }} fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); return EXIT_SUCCESS; }}'''.format(includes=get_includes(lang), op_name=op.name,
                        typ=typ, year=date.today().year, test=test,
                        comp1=comp1, comp2=comp2, unalign=unalign,
                        fill_vout=fill_vout, sizeof=common.sizeof(typ)))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Tests that load/store of degrees 2, 3 and 4 ravels vectors correctly

def gen_load_store_ravel(opts, op, typ, lang):
    # This test only the libs internal, not the API, so we only generate test
    # for c
    filename = get_filename(opts, op, typ, lang, 'ravel')
    if filename == None:
        return
    deg = op.name[4]
    align = op.name[5]
    if typ=='f16':
        convert_to='nsimd_f32_to_f16((f32)'
    else:
        convert_to='({typ})('.format(typ=typ)
    # one vne/vany check per sub-vector v.v0 .. v.v{deg-1}
    check = '\n'.join([''' comp = vset1({convert_to}{i}+1), {typ}); err = err || vany(vne(v.v{i}, comp, {typ}), {typ}); '''.format(typ=typ, i=i, convert_to=convert_to) \
        for i in range (0, int(deg))])
    # NOTE(review): severe extraction loss below — the generated template is
    # truncated at "for (i=0; i" and jumps into the middle of the NEXT
    # generator (the upcvt/downcvt branch of the cvt/reinterpret test,
    # whose `def` line and leading `if op.name == ...` branch are missing).
    # Both functions must be restored from the upstream egg/gen_tests.py;
    # the text is reproduced as found.
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} #define SIZE (2048 / {sizeof}) #define STATUS "test raveling of {op_name} over {typ}" #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} int main(void) {{ {typ}* vin; {typ}* vout; int i; int len = vlen({typ}); int n = {deg} * len; int err=0; vec({typ}) comp; vecx{deg}({typ}) v; fprintf(stdout, "test raveling of {op_name} over {typ}...\\n"); CHECK(vin = ({typ}*)nsimd_aligned_alloc(n * {sizeof})); CHECK(vout = ({typ}*)nsimd_aligned_alloc(n * {sizeof})); /* Fill in the vectors */ for (i=0; i tmp = nsimd::upcvt< nsimd::pack{logical}x2<{to_typ}> >(nsimd::load{logical}a< nsimd::pack{logical}<{from_typ}> >(in)); nsimd::store{logical}a(out, nsimd::downcvt< nsimd::pack{logical}<{from_typ}> >(tmp.v0, tmp.v1));'''. \
            format(op_name=op.name, from_typ=from_typ,
                   to_typ=to_typ, logical=logical)
    elif op.name == 'to_mask':
        # NOTE(review): the template argument of nsimd::loadla was lost in
        # extraction — restore from upstream.
        comp = '''nsimd::storela(out, nsimd::to_logical(nsimd::to_mask( nsimd::loadla >(in))));'''. \
            format(from_typ)
    else:
        # round-trip through the target type and back; result must equal the
        # input
        comp = \
            '''nsimd::store{logical}a(out, nsimd::{op_name}< nsimd::pack{logical}<{from_typ}> >(nsimd::{op_name}< nsimd::pack{logical}<{to_typ}> >(nsimd::load{logical}a< nsimd::pack{logical}<{from_typ}> >(in))));'''. \
            format(op_name=op.name, from_typ=from_typ, to_typ=to_typ,
                   logical=logical)
    if logical == 'l':
        rand = '(rand() % 2)'
    else:
        if op.name == 'reinterpret' and to_typ == 'f16' and \
           from_typ in ['i16', 'u16']:
            rand = '(15360 /* no denormal */ | (1 << (rand() % 4)))'
        else:
            if to_typ in common.utypes or from_typ in common.utypes:
                rand = '(1 << (rand() % 4))'
            else:
                rand = '((2 * (rand() % 2) - 1) * (1 << (rand() % 4)))'
    if from_typ == 'f16':
        # f16 compared bitwise through u16; note the trailing ';' here
        # yields a harmless double semicolon in the generated 'in[j] = ...;'
        rand = 'nsimd_f32_to_f16((f32){});'.format(rand)
        neq_test = '(*(u16*)&in[j]) != (*(u16*)&out[j])'
    else:
        rand = '({}){}'.format(from_typ, rand)
        neq_test = 'in[j] != out[j]'
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} {msvc_c4334_warning} #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} int main(void) {{ int i, j; {from_typ} *in, *out; int len = vlen({from_typ}); fprintf(stdout, "test of {op_name} from {from_typ} to {to_typ}...\\n"); CHECK(in = ({from_typ}*)nsimd_aligned_alloc(len * {sizeof})); CHECK(out = ({from_typ}*)nsimd_aligned_alloc(len * {sizeof})); for (i = 0; i < 100; i++) {{ for (j = 0; j < len; j++) {{ in[j] = {rand}; }} {comp} for (j = 0; j < len; j++) {{ if ({neq_test}) {{ exit(EXIT_FAILURE); }} }} }} fprintf(stdout, "test of {op_name} from {from_typ} to {to_typ}... OK\\n"); return EXIT_SUCCESS; }}'''.format(includes=get_includes(lang), op_name=op.name,
                        to_typ=to_typ, from_typ=from_typ, comp=comp,
                        year=date.today().year, rand=rand,
                        neq_test=neq_test, sizeof=common.sizeof(from_typ),
                        msvc_c4334_warning=msvc_c4334_warning \
                            if from_typ in ['i64', 'u64'] else ''))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Shuffle

def gen_reverse(opts, op, typ, lang):
    # Emit a test for `reverse`: storing the reversed load of 1..len must
    # give out[len - 1 - i] == in[i] for every lane.
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    if lang == 'c_base':
        test_code = \
            'vstorea(out, vreverse(vloada(in, {typ}), {typ}), {typ});'. \
            format(typ=typ)
    elif lang == 'c_adv':
        test_code = '''nsimd_storea(out, nsimd_reverse(nsimd_loada( nsimd_pack_{typ}, in)));'''.format(typ=typ)
    elif lang == 'cxx_base':
        test_code = \
            'nsimd::storea(out, nsimd::reverse(nsimd::loada(in, {typ}()), ' \
            '{typ}()), {typ}());'.format(typ=typ)
    elif lang == 'cxx_adv':
        # NOTE(review): the template argument of nsimd::loada was lost in
        # extraction — restore from upstream.
        test_code = \
            'nsimd::storea(out, nsimd::reverse(' \
            'nsimd::loada >(in)));'.format(typ=typ)
    if typ == 'f16':
        init = 'in[ i ] = nsimd_f32_to_f16((float)(i + 1));'
        comp = 'ok &= nsimd_f16_to_f32(out[len - 1 - i]) == ' \
               'nsimd_f16_to_f32(in[i]);'
    else:
        init = 'in[ i ] = ({typ})(i + 1);'.format(typ=typ)
        comp = 'ok &= out[len - 1 - i] == in[i];'
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} int main(void) {{ unsigned char i; int ok; {typ} * in; {typ} * out; int len = vlen({typ}); fprintf(stdout, "test of {op_name} over {typ}...\\n"); CHECK(in = ({typ}*)nsimd_aligned_alloc(len * {sizeof})); CHECK(out = ({typ}*)nsimd_aligned_alloc(len * {sizeof})); for( i = 0 ; i < len ; ++i ) {{ {init} }} {test_code} ok = 1; for( i = 0 ; i < len ; ++i ) {{ {comp} }} if( ok ) {{ fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); }} else {{ fprintf(stderr, "test of {op_name} over {typ}... FAIL\\n"); exit(EXIT_FAILURE); }} nsimd_aligned_free( in ); nsimd_aligned_free( out ); return EXIT_SUCCESS; }}'''.format(includes=get_includes(lang), op_name=op.name,
                        typ=typ, test_code=test_code,
                        year=date.today().year, sizeof=common.sizeof(typ),
                        init=init, comp=comp))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Unpack half

def gen_unpack_half(opts, op, typ, lang):
    # Emit a test for ziplo/ziphi/unziplo/unziphi: interleave or
    # deinterleave two random vectors and check lane placement against the
    # inputs.
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    # NOTE(review): left/right below appear unused in the visible body —
    # possibly leftovers; confirm against upstream.
    if typ == 'f16':
        left = '(double)nsimd_f16_to_f32(ref_out)'
        right = '(double)nsimd_f16_to_f32(nsimd_out)'
    elif typ == 'f32':
        left = '(double)ref_out'
        right = '(double)nsimd_out'
    else:
        left = 'ref_out'
        right = 'nsimd_out'
    if lang == 'c_base':
        typ_nsimd = 'vec({typ})'.format(typ=typ)
        vout1_comp = '''vec({typ}) va1, va2, vc; va1 = vloadu(&vin1[i], {typ}); va2 = vloadu(&vin2[i], {typ}); vc = v{op_name}(va1, va2, {typ}); vstoreu(&vout[i], vc, {typ});'''. \
            format(typ=typ, op_name=op.name)
    if lang == 'c_adv':
        typ_nsimd = 'nsimd_pack_{typ}'.format(typ=typ)
        vout1_comp = '''nsimd_pack_{typ} va1, va2, vc; va1 = nsimd_loadu(nsimd_pack_{typ}, &vin1[i]); va2 = nsimd_loadu(nsimd_pack_{typ}, &vin2[i]); vc = nsimd_{op_name}(va1, va2); nsimd_storeu(&vout[i], vc);'''. \
            format(typ=typ, op_name=op.name)
    if lang == 'cxx_base':
        typ_nsimd = 'vec({typ})'.format(typ=typ)
        vout1_comp = '''vec({typ}) va1, va2, vc; va1 = nsimd::loadu(&vin1[i], {typ}()); va2 = nsimd::loadu(&vin2[i], {typ}()); vc = nsimd::{op_name}(va1, va2, {typ}()); nsimd::storeu(&vout[i], vc, {typ}());'''. \
            format(typ=typ, op_name=op.name)
    if lang == 'cxx_adv':
        # NOTE(review): template arguments of nsimd::loadu were lost in
        # extraction — restore from upstream.
        typ_nsimd = 'nsimd::pack<{typ}>'.format(typ=typ)
        vout1_comp = '''nsimd::pack<{typ}> va1, va2, vc; va1 = nsimd::loadu >(&vin1[i]); va2 = nsimd::loadu >(&vin2[i]); vc = nsimd::{op_name}(va1, va2); nsimd::storeu(&vout[i], vc);'''. \
            format(typ=typ, op_name=op.name)
    op_test = 'step/(2 * nb_lane)'
    if op.name in['ziphi', 'ziplo']:
        # ziphi starts reading its lanes from the second half of the vector
        offset = 'int offset = {val};'.format(val= '0' \
                 if op.name == 'ziplo' else 'vlen({typ}) / 2'.format(typ=typ))
    else:
        offset = ''
    if op.name in ['unziplo', 'unziphi']:
        # unzip: even (lo) or odd (hi) lanes of each input end up in each
        # half of the output
        if typ == 'f16':
            comp_unpack = ''' (nsimd_f16_to_f32(vout[i]) != nsimd_f16_to_f32(vin1[vi + 2 * j + {i}])) || (nsimd_f16_to_f32(vout[i + step / 2]) != nsimd_f16_to_f32(vin2[vi + 2 * j + {i}])) '''.format(i = '0' if op.name == 'unziplo' else '1')
        else:
            comp_unpack = '''\
 (vout[i] != vin1[vi + 2 * j + {i}]) || (vout[i + step / 2] != vin2[vi + 2 * j + {i}]) '''.format(i = '0' if op.name == 'unziplo' else '1')
    else:
        if typ == 'f16':
            comp_unpack ='''(nsimd_f16_to_f32(vout[i]) != nsimd_f16_to_f32(vin1[j])) || (nsimd_f16_to_f32(vout[i + 1]) != nsimd_f16_to_f32(vin2[j]))'''
        else:
            comp_unpack ='''(vout[i] != vin1[j]) || (vout[i + 1] != vin2[j])'''
    # NOTE(review): nbits appears unused in the visible body; the two
    # '#include' header names in `head` below were lost in extraction.
    nbits = {'f16': '10', 'f32': '21', 'f64': '48'}
    head = '''{posix_c_source} {includes} #include #include {msvc_c4334_warning} #define SIZE (2048 / {sizeof}) #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} /* {simd} */ '''.format(year=date.today().year, typ=typ,
                posix_c_source=posix_c_source,
                includes=get_includes(lang), comp_unpack=comp_unpack,
                sizeof=common.sizeof(typ), simd=opts.simd,
                msvc_c4334_warning=msvc_c4334_warning \
                    if typ in ['i64', 'u64'] else '')
    if typ == 'f16':
        rand = '''nsimd_f32_to_f16((f32)(2 * (rand() % 2) - 1) * (f32)(1 << (rand() % 4)) / (f32)(1 << (rand() % 4)))'''
    else:
        rand = '''({typ})(({typ})(2 * (rand() % 2) - 1) * ({typ})(1 << (rand() % 4)) / ({typ})(1 << (rand() % 4)))'''.format(typ=typ)
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{head} int main(void) {{ int vi, i, j, step; {typ} *vin1, *vin2; {typ} *vout; CHECK(vin1 = ({typ} *)nsimd_aligned_alloc(SIZE * {sizeof})); CHECK(vin2 = ({typ} *)nsimd_aligned_alloc(SIZE * {sizeof})); CHECK(vout = ({typ} *)nsimd_aligned_alloc(SIZE * {sizeof})); step = vlen({typ}); fprintf(stdout, "test of {op_name} over {typ}...\\n"); /* Fill input vector(s) with random */ for (i = 0; i < SIZE; i++) {{ vin1[i] = {rand}; vin2[i] = {rand}; }} /* Fill output vector with computed values */ for (i = 0; i < SIZE; i += step) {{ {vout1_comp} }} /* Compare results */ if (step != 1) {{ {offset} for (vi = 0; vi < SIZE; vi += step){{ j = {init_j}; for (i = vi; i < {cond}; {inc}) {{ if({comp_unpack}) {{ fprintf(stderr, "test of {op_name} over {typ}... FAIL\\n"); exit(EXIT_FAILURE); }} j++; }} }} }} fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); fflush(stdout); return EXIT_SUCCESS; }} '''.format(includes=get_includes(lang), op_name=op.name, typ=typ,
                       year=date.today().year,sizeof=common.sizeof(typ),
                       rand=rand, head=head, comp_unpack=comp_unpack,
                       vout1_comp= vout1_comp, op_test=op_test,
                       typ_nsimd=typ_nsimd, offset=offset,
                       cond='vi + step' if op.name in['ziplo', 'ziphi'] \
                            else 'vi + step / 2',
                       init_j='vi + offset' if op.name in['ziplo', 'ziphi'] \
                              else '0',
                       inc='i += 2' if op.name in['ziphi', 'ziplo'] \
                           else 'i++',
                       pos='0' if op.name in ['ziplo', 'unziplo', 'unziphi'] \
                           else op_test))
    common.clang_format(opts, filename)

# ------------------------------------------------------------------------------
# Unpack

def gen_unpack(opts, op, typ, lang):
    # Emit a test for the full zip/unzip (both halves at once, returning a
    # degree-2 structure). NOTE: this function runs past the end of this
    # chunk; its tail is not visible here.
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    # NOTE(review): left/right appear unused in the visible body — confirm
    # against upstream.
    if typ == 'f16':
        left = '(double)nsimd_f16_to_f32(ref_out)'
        right = '(double)nsimd_f16_to_f32(nsimd_out)'
    elif typ == 'f32':
        left = '(double)ref_out'
        right = '(double)nsimd_out'
    else:
        left = 'ref_out'
        right = 'nsimd_out'
    if lang == 'c_base':
        typ_nsimd = 'vec({typ})'.format(typ=typ)
        vout1_comp = \
            '''vec({typ}) va1, va2; vecx2({typ}) vc; va1 = vloadu(&vin1[i], {typ}); va2 = vloadu(&vin2[i], {typ}); vc = v{op_name}(va1, va2, {typ}); vstoreu(&vout[2 * i], vc.v0, {typ}); vstoreu(&vout[2 * i + vlen({typ})], vc.v1, {typ});'''. \
            format(typ=typ, op_name=op.name)
    if lang == 'c_adv':
        typ_nsimd = 'nsimd_pack_{typ}'.format(typ=typ)
        vout1_comp = \
            '''nsimd_pack_{typ} va1, va2; nsimd_packx2_{typ} vc; va1 = nsimd_loadu(nsimd_pack_{typ}, &vin1[i]); va2 = nsimd_loadu(nsimd_pack_{typ}, &vin2[i]); vc = nsimd_{op_name}(va1, va2); nsimd_storeu(&vout[2 * i], vc.v0); nsimd_storeu(&vout[2 * i + nsimd_len(nsimd_pack_{typ})], vc.v1);'''.format(typ=typ, op_name=op.name)
    if lang == 'cxx_base':
        typ_nsimd = 'vec({typ})'.format(typ=typ)
        vout1_comp = \
            '''vec({typ}) va1, va2; vecx2({typ}) vc; va1 = nsimd::loadu(&vin1[i], {typ}()); va2 = nsimd::loadu(&vin2[i], {typ}()); vc = nsimd::{op_name}(va1, va2, {typ}()); nsimd::storeu(&vout[2 * i], vc.v0, {typ}()); nsimd::storeu(&vout[2 * i + vlen({typ})], vc.v1, {typ}());'''. \
            format(typ=typ, op_name=op.name)
    if lang == 'cxx_adv':
        # NOTE(review): template arguments of nsimd::loadu were lost in
        # extraction — restore from upstream.
        typ_nsimd = 'nsimd::pack<{typ}>'.format(typ=typ)
        vout1_comp = \
            '''nsimd::pack<{typ}> va1, va2; nsimd::packx2<{typ}> vc; va1 = nsimd::loadu >(&vin1[i]); va2 = nsimd::loadu >(&vin2[i]); vc = nsimd::{op_name}(va1, va2); nsimd::storeu(&vout[2 * i], vc.v0); nsimd::storeu(&vout[2 * i + nsimd::len({typ}())], vc.v1);'''. \
            format(typ=typ, op_name=op.name)
    # NOTE(review): the two '#include' header names below were lost in
    # extraction.
    head = '''{posix_c_source} {includes} #include #include {msvc_c4334_warning} #define SIZE (2048 / {sizeof}) #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} /* {simd} */ '''.format(year=date.today().year, typ=typ,
                posix_c_source=posix_c_source,
                includes=get_includes(lang), sizeof=common.sizeof(typ),
                simd= opts.simd,
                msvc_c4334_warning=msvc_c4334_warning \
                    if typ in ['i64', 'u64'] else '')
    if typ == 'f16':
        rand = 'nsimd_f32_to_f16((f32)(2 * (rand() % 2) - 1) * ' \
               '(f32)(1 << (rand() % 4)) / (f32)(1 << (rand() % 4)))'
    else:
        rand = '({typ})(({typ})(2 * (rand() % 2) - 1) * ' \
               '({typ})(1 << (rand() % 4)) / ({typ})(1 << (rand() % 4)))'.
\ format(typ=typ) if op.name == 'zip': scalar_code = '''for(i = 0; i < step; i ++) {{ out_ptr[2 * i] = vin1_ptr[i]; out_ptr[2 * i + 1] = vin2_ptr[i]; }} ''' else: scalar_code = \ '''for(i = 0; i < step / 2; i++) {{ out_ptr[i] = vin1_ptr[2 * i]; out_ptr[step / 2 + i] = vin2_ptr[2 * i]; out_ptr[step + i] = vin1_ptr[2 * i + 1]; out_ptr[step + step / 2 + i] = vin2_ptr[2 * i + 1]; }} ''' if typ == 'f16': comp = 'nsimd_f16_to_f32(vout[vi]) != nsimd_f16_to_f32(vout_ref[vi])' else: comp = 'vout[vi] != vout_ref[vi]' with common.open_utf8(opts, filename) as out: out.write( '''{head} int main(void){{ int i, vi, step; {typ} *vin1, *vin2; {typ} *vout; {typ} *vout_ref; CHECK(vin1 = ({typ} *)nsimd_aligned_alloc(SIZE * {sizeof})); CHECK(vin2 = ({typ} *)nsimd_aligned_alloc(SIZE * {sizeof})); CHECK(vout = ({typ} *)nsimd_aligned_alloc(2 * SIZE * {sizeof})); CHECK(vout_ref = ({typ} *)nsimd_aligned_alloc(2 * SIZE * {sizeof})); step = vlen({typ}); fprintf(stdout, "test of {op_name} over {typ}...\\n"); /* Fill input vector(s) with random */ for (i = 0; i < SIZE; i++) {{ vin1[i] = {rand}; vin2[i] = {rand}; }} /* Compute a scalar reference version */ for(vi = 0; vi < SIZE; vi += step) {{ {typ} *out_ptr = vout_ref + 2 * vi; {typ} *vin1_ptr = vin1 + vi; {typ} *vin2_ptr = vin2 + vi; {scalar_code} }} /* Fill output vector with computed values */ for (i = 0; i < SIZE; i += step) {{ {vout1_comp} }} /* Compare results */ for(vi = 0; vi < SIZE; vi++) {{ if({comp}) {{ fprintf(stderr, "test of {op_name} over {typ}... FAIL\\n"); exit(EXIT_FAILURE); }} }} fprintf(stdout, "test of {op_name} over {typ}... 
OK\\n"); fflush(stdout); return EXIT_SUCCESS; }} '''.format(includes=get_includes(lang), op_name=op.name, typ=typ, year=date.today().year,sizeof=common.sizeof(typ), rand=rand, head=head, scalar_code=scalar_code, comp=comp, vout1_comp= vout1_comp, typ_nsimd=typ_nsimd)) common.clang_format(opts, filename) # ----------------------------------------------------------------------------- # Entry point def doit(opts): common.myprint(opts, 'Generating tests') for op_name, operator in operators.operators.items(): # Skip non-matching tests if opts.match and not opts.match.match(op_name): continue for typ in operator.types: if not should_i_do_the_test(operator, '', typ): continue elif operator.name == 'nbtrue': gen_nbtrue(opts, operator, typ, 'c_base') gen_nbtrue(opts, operator, typ, 'c_adv') gen_nbtrue(opts, operator, typ, 'cxx_base') gen_nbtrue(opts, operator, typ, 'cxx_adv') elif operator.name == 'addv': if typ in common.ftypes: gen_addv(opts, operator, typ, 'c_base') gen_addv(opts, operator, typ, 'c_adv') gen_addv(opts, operator, typ, 'cxx_base') gen_addv(opts, operator, typ, 'cxx_adv') elif operator.name == 'adds': gen_adds(opts, operator, typ, 'c_base') gen_adds(opts, operator, typ, 'c_adv') gen_adds(opts, operator, typ, 'cxx_base') gen_adds(opts, operator, typ, 'cxx_adv') elif operator.name == 'subs': gen_subs(opts, operator, typ, 'c_base') gen_subs(opts, operator, typ, 'c_adv') gen_subs(opts, operator, typ, 'cxx_base') gen_subs(opts, operator, typ, 'cxx_adv') elif operator.name in ['all', 'any']: gen_all_any(opts, operator, typ, 'c_base') gen_all_any(opts, operator, typ, 'c_adv') gen_all_any(opts, operator, typ, 'cxx_base') gen_all_any(opts, operator, typ, 'cxx_adv') elif operator.name == 'iota': gen_iota(opts, operator, typ, 'c_base') gen_iota(opts, operator, typ, 'c_adv') gen_iota(opts, operator, typ, 'cxx_base') gen_iota(opts, operator, typ, 'cxx_adv') elif operator.name in ['reinterpret', 'reinterpretl', 'cvt', 'upcvt', 'to_mask']: for to_typ in 
common.get_output_types(typ, operator.output_to): if not should_i_do_the_test(operator, to_typ, typ): continue gen_reinterpret_convert(opts, operator, typ, to_typ, 'c_base') gen_reinterpret_convert(opts, operator, typ, to_typ, 'c_adv') gen_reinterpret_convert(opts, operator, typ, to_typ, 'cxx_base') gen_reinterpret_convert(opts, operator, typ, to_typ, 'cxx_adv') elif operator.name in ['load2a', 'load2u', 'load3a', 'load3u', 'load4a', 'load4u']: gen_load_store(opts, operator, typ, 'c_base') gen_load_store(opts, operator, typ, 'c_adv') gen_load_store(opts, operator, typ, 'cxx_base') gen_load_store(opts, operator, typ, 'cxx_adv') gen_load_store_ravel(opts, operator, typ, 'c_base') elif operator.name in ['gather', 'gather_linear']: gen_gather_scatter(opts, operator, typ, 'c_base') gen_gather_scatter(opts, operator, typ, 'c_adv') gen_gather_scatter(opts, operator, typ, 'cxx_base') gen_gather_scatter(opts, operator, typ, 'cxx_adv') elif operator.name == 'mask_scatter': gen_mask_scatter(opts, operator, typ, 'c_base') gen_mask_scatter(opts, operator, typ, 'c_adv') gen_mask_scatter(opts, operator, typ, 'cxx_base') gen_mask_scatter(opts, operator, typ, 'cxx_adv') elif operator.name in ['maskz_gather', 'masko_gather']: gen_maskoz_gather(opts, operator, typ, 'c_base') gen_maskoz_gather(opts, operator, typ, 'c_adv') gen_maskoz_gather(opts, operator, typ, 'cxx_base') gen_maskoz_gather(opts, operator, typ, 'cxx_adv') elif operator.name in ['masko_loada1', 'masko_loadu1', 'maskz_loada1', 'maskz_loadu1']: gen_mask_load(opts, operator, typ, 'c_base') gen_mask_load(opts, operator, typ, 'c_adv') gen_mask_load(opts, operator, typ, 'cxx_base') gen_mask_load(opts, operator, typ, 'cxx_adv') elif operator.name in ['mask_storea1', 'mask_storeu1']: gen_mask_store(opts, operator, typ, 'c_base') gen_mask_store(opts, operator, typ, 'c_adv') gen_mask_store(opts, operator, typ, 'cxx_base') gen_mask_store(opts, operator, typ, 'cxx_adv') elif operator.name == 'reverse': gen_reverse(opts, operator, 
typ, 'c_base'); gen_reverse(opts, operator, typ, 'c_adv'); gen_reverse(opts, operator, typ, 'cxx_base'); gen_reverse(opts, operator, typ, 'cxx_adv'); elif operator.name in ['ziplo', 'ziphi', 'unziplo', 'unziphi']: gen_unpack_half(opts, operator, typ, 'c_base') gen_unpack_half(opts, operator, typ, 'c_adv') gen_unpack_half(opts, operator, typ, 'cxx_base') gen_unpack_half(opts, operator, typ, 'cxx_adv') elif operator.name in ['zip', 'unzip']: gen_unpack(opts, operator, typ, 'c_base') gen_unpack(opts, operator, typ, 'c_adv') gen_unpack(opts, operator, typ, 'cxx_base') gen_unpack(opts, operator, typ, 'cxx_adv') else: gen_test(opts, operator, typ, 'c_base') gen_test(opts, operator, typ, 'c_adv') gen_test(opts, operator, typ, 'cxx_base') gen_test(opts, operator, typ, 'cxx_adv') ================================================ FILE: egg/get_sleef_code.py ================================================ # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import common
import shutil
import requests
import zipfile
import os

# -----------------------------------------------------------------------------

def doit(opts):
    # Download the pinned Sleef release, copy the sources NSIMD needs into
    # opts.src_dir and patch them so they build inside NSIMD: symbol aliases
    # commented out, cpuid runtime detection stubbed, force-inline removed,
    # and exported symbols renamed per SIMD extension and precision (ULPs).
    common.myprint(opts, 'Copy native Sleef version {}'. \
                   format(opts.sleef_version))
    # First download Sleef
    sleef_dir = os.path.join(opts.script_dir, '..', '_deps-sleef')
    common.mkdir_p(sleef_dir)
    url = 'https://github.com/shibatch/sleef/archive/refs/tags/{}.zip'. \
          format(opts.sleef_version)
    r = requests.get(url, allow_redirects=True)
    sleef_zip = os.path.join(sleef_dir, 'sleef.zip')
    with open(sleef_zip, 'wb') as fout:
        fout.write(r.content)
    # Unzip sleef
    with zipfile.ZipFile(sleef_zip, 'r') as fin:
        fin.extractall(path=sleef_dir)
    # Copy helper function
    def copy(filename):
        # Flatten the Sleef tree: copy `filename` to the top of opts.src_dir.
        dst_filename = os.path.basename(filename)
        shutil.copyfile(os.path.join(sleef_dir,
                                     'sleef-{}'.format(opts.sleef_version),
                                     filename),
                        os.path.join(opts.src_dir, dst_filename))
    # Copy files
    copy('src/libm/sleefsimddp.c')
    copy('src/libm/sleefsimdsp.c')
    copy('src/libm/sleefdp.c')
    copy('src/libm/sleefsp.c')
    copy('src/common/misc.h')
    copy('src/libm/estrin.h')
    copy('src/libm/dd.h')
    copy('src/libm/df.h')
    copy('src/libm/rempitab.c')
    copy('src/arch/helpersse2.h')
    copy('src/arch/helperavx.h')
    copy('src/arch/helperavx2.h')
    copy('src/arch/helperavx512f.h')
    copy('src/arch/helperneon32.h')
    copy('src/arch/helperadvsimd.h')
    copy('src/arch/helperpower_128.h')
    copy('src/arch/helpersve.h')
    # Sleef uses aliases but we don't need those so we comment them
    def comment_DALIAS_lines(filename):
        # Rewrite `filename` in place, turning every line that starts with
        # 'DALIAS_' into a C comment.
        src = os.path.join(opts.src_dir, filename)
        dst = os.path.join(opts.src_dir, 'tmp.c')
        with open(src, 'r') as fin, open(dst, 'w') as fout:
            for line in fin:
                if line.startswith('DALIAS_'):
                    fout.write('/* {} */\n'.format(line.strip()))
                else:
                    fout.write(line)
        shutil.copyfile(dst, src)
        os.remove(dst)
    comment_DALIAS_lines('sleefsimdsp.c')
    comment_DALIAS_lines('sleefsimddp.c')
    # Sleef provides runtime SIMD detection via cpuid but we don't need it
    def replace_x86_cpuid(filename):
        # Replace the line declaring Sleef_x86CpuID with a static stub that
        # reports all features as present.
        src = os.path.join(opts.src_dir, filename)
        dst = os.path.join(opts.src_dir, 'tmp.c')
        with open(src, 'r') as fin, open(dst, 'w') as fout:
            for line in fin:
                if line.startswith('void Sleef_x86CpuID'):
                    fout.write(
'''static inline void Sleef_x86CpuID(int32_t out[4], uint32_t eax,
                                  uint32_t ecx) {
  /* We don't care for cpuid detection */
  out[0] = 0xFFFFFFFF;
  out[1] = 0xFFFFFFFF;
  out[2] = 0xFFFFFFFF;
  out[3] = 0xFFFFFFFF;
}
''')
                else:
                    fout.write(line)
        shutil.copyfile(dst, src)
        os.remove(dst)
    replace_x86_cpuid('helpersse2.h')
    replace_x86_cpuid('helperavx.h')
    replace_x86_cpuid('helperavx2.h')
    replace_x86_cpuid('helperavx512f.h')
    # Sleef uses force inline through its INLINE macro defined in misc.h
    # We modify it to avoid warnings and because force inline has been a pain
    # in the past. We also rename some exported symbols.
    with open(os.path.join(opts.src_dir, 'misc.h'), 'a') as fout:
        fout.write(
'''

/* NSIMD specific */
#ifndef NSIMD_SLEEF_MISC_H
#define NSIMD_SLEEF_MISC_H

#ifdef INLINE
#undef INLINE
#endif
#define INLINE inline

#define Sleef_rempitabdp nsimd_sleef_rempitab_f64
#define Sleef_rempitabsp nsimd_sleef_rempitab_f32

#endif
''')
    # Sleef functions must be renamed properly for each SIMD extensions.
    # Moreover their name must contain their precision (in ULPs). This
    # precision is not the same for all functions and some functions can have
    # differents flavours (or precisions). The "database" is contained within
    # src/libm/funcproto.h. So we parse it and produce names
    # in headers "rename[SIMD ext].h" to avoid modifying Sleef C files.
    funcproto = os.path.join(sleef_dir,
                             'sleef-{}'.format(opts.sleef_version),
                             'src', 'libm', 'funcproto.h')
    defines = []
    # Maps funcproto's ULP-suffix code (items[2]) to Sleef's name suffix.
    ulp_suffix = {
        '0' : '',
        '1' : '_u1',
        '2' : '_u05',
        '3' : '_u35',
        '4' : '_u15',
        '5' : '_u3500'
    }
    with open(funcproto, 'r') as fin:
        for line in fin:
            if (line.find('{') != -1 and line.find('}') != -1):
                items = [item.strip() \
                         for item in line.strip(' \n\r{},').split(',')]
                items[0] = items[0].strip('"')
                # The table is NULL-terminated.
                if items[0] == 'NULL':
                    break
                sleef_name_f64 = items[0] + ulp_suffix[items[2]]
                sleef_name_f32 = items[0] + 'f' + ulp_suffix[items[2]]
                # items[1] is the precision in tenths of ULPs; '5' means 0.5.
                items[1] = items[1] if items[1] != '5' else '05'
                if items[1] == '-1':
                    # -1: exact function, no ULP number in the NSIMD name.
                    nsimd_name_f64 = 'nsimd_sleef_{}_{{nsimd_ext}}_f64'. \
                                     format(items[0])
                    nsimd_name_f32 = 'nsimd_sleef_{}_{{nsimd_ext}}_f32'. \
                                     format(items[0])
                else:
                    # {det} and {nsimd_ext} are filled later per extension.
                    nsimd_name_f64 = \
                        'nsimd_sleef_{}_u{}{{det}}_{{nsimd_ext}}_f64'. \
                        format(items[0], items[1])
                    nsimd_name_f32 = \
                        'nsimd_sleef_{}_u{}{{det}}_{{nsimd_ext}}_f32'. \
                        format(items[0], items[1])
                defines.append('#define x{} {}'.format(sleef_name_f64,
                                                       nsimd_name_f64))
                defines.append('#define x{} {}'.format(sleef_name_f32,
                                                       nsimd_name_f32))
    defines = '\n'.join(defines)
    # One Sleef extension can back several NSIMD extensions.
    sleef_to_nsimd = {
        '': ['scalar'],
        'sse2': ['sse2'],
        'sse4': ['sse42'],
        'avx': ['avx'],
        'avx2': ['avx2'],
        'avx512f': ['avx512_knl', 'avx512_skylake'],
        'neon32': ['neon128'],
        'advsimd': ['aarch64'],
        'sve': ['sve128', 'sve256', 'sve512', 'sve1024', 'sve2048'],
        'vsx': ['vmx', 'vsx']
    }
    for simd_ext in ['', 'sse2', 'sse4', 'avx', 'avx2', 'avx512f',
                     'neon32', 'advsimd', 'sve', 'vsx']:
        renameheader = os.path.join(opts.src_dir,
                                    'rename{}.h'.format(simd_ext))
        se = simd_ext if simd_ext != '' else 'scalar'
        with open(renameheader, 'w') as fout:
            fout.write(
'''#ifndef RENAME{SIMD_EXT}_H
#define RENAME{SIMD_EXT}_H

'''.format(SIMD_EXT=se.upper()))
            for nse in sleef_to_nsimd[simd_ext]:
                # Scalar has no guard; SIMD renames are guarded by the
                # corresponding NSIMD_* macro.
                ifdef = '' if simd_ext == '' \
                        else '#ifdef NSIMD_{}'.format(nse.upper())
                endif = '' if simd_ext == '' else '#endif'
                # NOTE(review): the defines_det_f64/defines_nondet_f64
                # kwargs below do not appear as placeholders in the template
                # -- confirm whether they are leftovers.
                fout.write(
'''{hbar}

/* Naming of functions {nsimd_ext} */

{ifdef}

#ifdef DETERMINISTIC

{defines_det_f32}

#else

{defines_nondet_f32}

#endif

#define rempi nsimd_sleef_rempi_{nsimd_ext}
#define rempif nsimd_sleef_rempif_{nsimd_ext}
#define rempisub nsimd_sleef_rempisub_{nsimd_ext}
#define rempisubf nsimd_sleef_rempisubf_{nsimd_ext}

#define gammak nsimd_gammak_{nsimd_ext}
#define gammafk nsimd_gammafk_{nsimd_ext}

{endif}

'''.format(NSIMD_EXT=nse.upper(), nsimd_ext=nse, hbar=common.hbar,
           ifdef=ifdef, endif=endif,
           defines_det_f32=defines.format(det='d', nsimd_ext=nse),
           defines_nondet_f32=defines.format(det='', nsimd_ext=nse),
           defines_det_f64=defines.format(det='d', nsimd_ext=nse),
           defines_nondet_f64=defines.format(det='', nsimd_ext=nse)))
            fout.write('\n\n#endif\n\n')
        common.clang_format(opts, renameheader)


================================================
FILE: egg/hatch.py
================================================
# Copyright (c) 2021 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# What does this script?
# ----------------------
#
# This script generates code for each architecture, the base C/C++ APIs and
# the advanced C++ API. Each part to be generated is handled by a
# `gen_*.py` file. This script simply calls the `doit` function of each
# `gen_*.py` module. Names are self-explanatory.
#
# -----------------------------------------------------------------------------

# First thing we do is check whether python3 is used
import sys
if sys.version_info[0] < 3:
    print('Only Python 3 is supported')
    sys.exit(1)

# -----------------------------------------------------------------------------
# Imports

import argparse
import os
import re
import common
import gen_archis
import gen_base_apis
import gen_adv_cxx_api
import gen_adv_c_api
import gen_tests
import gen_src
import gen_doc
import gen_friendly_but_not_optimized
import gen_modules
import gen_scalar_utilities
import get_sleef_code

# Dir of this script
script_dir = os.path.dirname(__file__)
if script_dir == '':
    script_dir = '.'

# -----------------------------------------------------------------------------
# Arguments parsing

def parse_args(args):
    # Parse the command line and derive every option the generators rely on
    # (per-part booleans, default directories, pinned Sleef version).
    def parse_simd(value):
        # Accept a platform alias (x86/arm/ppc/all) or a comma-separated
        # list of SIMD extension names; expand to the dependency closure.
        ## Split .simd now
        values = {
            'x86': common.x86_simds,
            'arm': common.arm_simds,
            'ppc': common.ppc_simds,
            'all': common.simds,
        }.get(value, value.split(','))
        ## Check that all simd are valid
        ret = []
        for simd in values:
            if simd not in common.simds:
                raise argparse.ArgumentTypeError(
                    "SIMD '{}' not found in {}".format(simd, common.simds))
            ret += common.simds_deps[simd]
        return list(set(ret))
    def parse_match(value):
        # Compile the operator-name filter regex; None means no filtering.
        if value is None:
            return None
        else:
            return re.compile(value)
    # In pratice, we either generate all or all except tests and we never
    # change default directories for code generation. So we remove unused
    # options and regroup some into --library.
    parser = argparse.ArgumentParser(
                 description='This is NSIMD generation script.')
    parser.add_argument('--force', '-f', action='store_true',
        help='Generate all files even if they already exist')
    parser.add_argument('--list-files', '-L', action='store_true',
        default=False,
        help='List files that will be created by hatch.py')
    parser.add_argument('--all', '-A', action='store_true',
        help='Generate code for the library and its tests')
    parser.add_argument('--library', '-l', action='store_true',
        help='Generate code of the library (C and C++ APIs)')
    parser.add_argument('--sleef', '-s', action='store_true', default=False,
        help='Compile Sleef')
    parser.add_argument('--tests', '-t', action='store_true',
        help='Generate tests in C and C++')
    parser.add_argument('--doc', '-d', action='store_true',
        help='Generate all documentation')
    # NOTE(review): this flag uses store_false, so passing
    # --enable-clang-format/-F actually turns clang-format OFF, as the help
    # text says -- the option name is misleading; confirm before renaming.
    parser.add_argument('--enable-clang-format', '-F', action='store_false',
        default=True,
        help='Disable Clang Format (mainly for speed on Windows)')
    parser.add_argument('--sve-emulate-bool', action='store_true',
        default=False,
        help='Use normal SVE vector to emulate predicates.')
    parser.add_argument('--simd', '-D', type=parse_simd, default='all',
        help='List of SIMD extensions (separated by a comma)')
    parser.add_argument('--match', '-m', type=parse_match, default=None,
        help='Regex used to filter generation on operator names')
    parser.add_argument('--verbose', '-v', action = 'store_true',
        default=None,
        help='Enable verbose mode')
    parser.add_argument('--simple-license', action='store_true',
        default=False,
        help='Put a simple copyright statement instead of the whole license')
    opts = parser.parse_args(args)
    # When -L has been chosen, we want to list all files and so we have to
    # turn to True other parameters
    if opts.list_files:
        opts.library = True
        opts.tests = True
        opts.force = True
        opts.doc = True
    # We set variables here because all the code depends on them + we do want
    # to keep the possibility to change them in the future
    opts.archis = opts.library
    opts.base_apis = opts.library
    opts.adv_cxx_api = opts.library
    opts.adv_c_api = opts.library
    opts.friendly_but_not_optimized = opts.library
    opts.src = opts.library
    opts.scalar_utilities = opts.library
    opts.sleef_version = '3.5.1'
    opts.include_dir = os.path.join(script_dir, '..', 'include', 'nsimd')
    opts.tests_dir = os.path.join(script_dir, '..', 'tests')
    opts.src_dir = os.path.join(script_dir, '..', 'src')
    return opts

# -----------------------------------------------------------------------------
# Entry point

def main():
    # Run every generator selected on the command line, in dependency order.
    opts = parse_args(sys.argv[1:])
    opts.script_dir = script_dir
    opts.modules_list = None
    opts.platforms_list = None
    ## Gather all SIMD dependencies
    opts.simd = common.get_simds_deps_from_opts(opts)
    common.myprint(opts, 'List of SIMD: {}'.format(', '.join(opts.simd)))
    if opts.archis == True or opts.all == True:
        gen_archis.doit(opts)
    if opts.base_apis == True or opts.all == True:
        gen_base_apis.doit(opts)
    if opts.adv_cxx_api == True or opts.all == True:
        gen_adv_cxx_api.doit(opts)
    if opts.adv_c_api == True or opts.all == True:
        gen_adv_c_api.doit(opts)
    if opts.tests == True or opts.all == True:
        gen_tests.doit(opts)
    if opts.src == True or opts.all == True:
        gen_src.doit(opts)
    if opts.sleef == True or opts.all == True:
        get_sleef_code.doit(opts)
    if opts.scalar_utilities == True or opts.all == True:
        gen_scalar_utilities.doit(opts)
    if opts.friendly_but_not_optimized == True or opts.all == True:
        gen_friendly_but_not_optimized.doit(opts)
    gen_modules.doit(opts) # this must be here after all NSIMD
    if opts.doc == True or opts.all == True:
        gen_doc.doit(opts)

if __name__ == '__main__':
    main()


================================================
FILE: egg/modules/fixed_point/gen_doc.py
================================================
# Use utf-8 encoding
# -*- coding: utf-8 -*-

# Copyright (c) 2019 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software
without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
import platform
import io
import sys
import subprocess
import collections
import re
import string

import common
import operators

# ------------------------------------------------------------------------------

def gen_overview(opts):
    # Write the fixed_point module overview markdown page.
    # NOTE(review): several C++ template argument lists in the markdown
    # below look truncated (e.g. "template struct pack;") in this copy of
    # the file -- confirm against upstream.
    filename = common.get_markdown_file(opts, 'overview', 'fixed_point')
    with common.open_utf8(opts, filename) as fout:
        fout.write('''
# NSIMD fixed point module

## Description

This module implements a fixed-point numbers support for the `nsimd` library.
Fixed-point numbers are integer types used to represent decimal numbers. A
number `lf` of bits are used to encode its integer part, and `rt` bits are
used to encode its fractional part.

The fixed_point module uses the templated type `nsimd::fixed_point::fp_t` to
represent a fixed_point number. All the basic floating-point arithmetic
operaors have been defined, therefore fp_t elements can be manipulated as
normal numbers.
The fixed_point module will use a `i8`, `i16`, or `i32` integer type for
storage, depending on the value of `lf + 2 * rt`.

All the functions of the module are under the namespace
`nsimd::fixed_point`, and match the same interface than `nsimd` C++ .

The `fp_t` struct type is defined in `fixed.hpp`, and the associated simd
`fpsimd_t` struct type are defined in `simd.hpp`.

The modules redefines the `nsimd` pack type for fixed-point numbers,
templated with `lf` and `rt` :

```C++
namespace nsimd {
namespace fixed_point {
template struct pack;
} // namespace fixed_point
} // namespace nsimd
```

Then, the pack can be manipulated as an `nsimd` pack like other scalar types.

## Compatibility

The fixed point module is a C++ only API, compatible with the C++98
standard. It has the same compilers and hardware support than the main
`nsimd` API (see the [API index](index.md)).

## Example

Here is a minimal example([main.cpp](../../examples/module_fixed_point.cpp)):

@[INCLUDE_CODE:L21:L61](../../examples/module_fixed_point.cpp)

To test with avx2 run :
```bash
export NSIMD_ROOT=
g++ -o main -I$NSIMD_ROOT/include -mavx2 -DNSIMD_AVX2 main.cpp
./main
```

The console output will look like this :
```console
$>./main
1.35938 | -0.421875 | 0.9375
1.13281 | 1.19531 | 2.32812
1.64844 | -1.21094 | 0.4375
-0.660156 | 1.07422 | 0.414062
-0.890625 | 0.214844 | -0.675781
-0.0898438 | 0.515625 | 0.425781
-0.539062 | 0.0546875 | -0.484375
1.80859 | 1.66406 | 3.47266
```
''')

# Markdown skeleton of one operator's documentation page; filled by gen_doc.
api_template = '''\
# {full_name}

{desc}

## Template parameter type for T:

When using the following typedef :
```c++
typedef nsimd::fixed_point::fp_t fp_t
```
The T template parameter is one of the following types depending on the
operator:
- `set1`, `loadu` and `loada`:
```c++
nsimd::fixed_point::pack
```
- `loadlu`, `loadla`:
```c++
nsimd::fixed_point::packl
```
- Other operators:
```c++
nsimd::fixed_point::fp_t
```

## C++ API

```c++
{decl}
```
'''

# One C++ declaration line inside the API code block.
decl_template = '''\
template {ret}{op}({args});\n\n'''

# -----------------------------------------------------------------------------

def get_type(param, return_typ=False):
    # Map an operator signature letter to the C++ type shown in the doc;
    # `return_typ` selects the by-value form over the const-ref form.
    # Returns None for unknown letters.
    if param == '_':
        return 'void'
    elif param == '*':
        return 'typename T::value_type *'
    elif param == 'c*':
        return 'const typename T::value_type *'
    elif param == 's':
        return 'typename T::value_type'
    elif param in 'v':
        # NOTE(review): `param in 'v'` also matches the empty string;
        # presumably `param == 'v'` was meant -- confirm.
        return 'pack' if return_typ else 'const pack &'
    elif param == 'l':
        return 'packl' if return_typ else 'const packl &'
    elif param == 'p':
        return 'int '
    else:
        return None

# -----------------------------------------------------------------------------

def gen_decl(op):
    # Build the C++ declaration(s) of operator `op` for its doc page:
    # the named function, plus the overloaded operator when one exists,
    # wrapped in the nsimd::fixed_point namespace.
    sig = '{}{} {{}}({});'.format(
              'template ' \
              if 'v' not in op.params[1:] and \
                 'l' not in op.params[1:] else '',
              get_type(op.params[0], True),
              ', '.join(['{} {}'.format(
                  get_type(op.params[i + 1]), common.get_arg(i)) \
                  for i in range(len(op.params[1:]))])
          )
    ret = 'namespace nsimd {\n' \
          'namespace fixed_point {\n\n' + sig.format(op.name) + '\n\n'
    if op.cxx_operator != None:
        ret += sig.format('operator' + op.cxx_operator) + '\n\n'
    ret += '} // namespace fixed_point\n' \
           '} // namespace nsimd'
    return ret

# -----------------------------------------------------------------------------

def gen_api(opts, op_list):
    # Write the fixed_point API index page: operators grouped by category,
    # each linking to its own page.
    api = dict()
    for _, operator in operators.operators.items():
        if operator.name not in op_list:
            continue
        for c in operator.categories:
            if c not in api:
                api[c] = [operator]
            else:
                api[c].append(operator)
    filename = common.get_markdown_file(opts, 'api', 'fixed_point')
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# NSIMD fixed point API\n''')
        for c, ops in api.items():
            if len(ops) == 0:
                continue
            fout.write('\n## {}\n\n'.format(c.title))
            for op in ops:
                fout.write('- [{} ({})](module_fixed_point_api_{}.md)\n'. \
                           format(op.full_name, op.name,
                                  common.to_filename(op.name)))

# -----------------------------------------------------------------------------

def gen_doc(opts, op_list):
    # Write one markdown page per operator in `op_list` using api_template.
    for _, op in operators.operators.items():
        if op.name not in op_list:
            continue
        filename = common.get_markdown_api_file(opts, op.name, 'fixed_point')
        with common.open_utf8(opts, filename) as fout:
            fout.write(api_template.format(full_name=op.full_name,
                                           desc=op.desc,
                                           decl=gen_decl(op)))

# -----------------------------------------------------------------------------

def doit(opts, op_list):
    # Entry point: generate overview, API index and per-operator pages.
    common.myprint(opts, 'Generating doc for module fixed_point')
    gen_overview(opts)
    gen_api(opts, op_list)
    gen_doc(opts, op_list)


================================================
FILE: egg/modules/fixed_point/gen_tests.py
================================================
# Copyright (c) 2019 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import os import sys import common # ------------------------------------------------------------------------------- def get_filename(opts, op, lf, rt): tests_dir = os.path.join(opts.tests_dir, "modules/fixed_point") common.mkdir_p(tests_dir) filename = os.path.join(tests_dir, '{}.fp_{}_{}.cpp'.format(op, lf, rt)) if os.path.exists(filename): os.remove(filename) if common.can_create_filename(opts, filename): return filename else: return None includes = """ #include #include #include #include #include #include #include #include """ arithmetic_aliases = """ typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack vec_t; typedef nsimd::fixed_point::packl vecl_t; typedef nsimd::fixed_point::pack::value_type raw_t; typedef nsimd::fixed_point::packl::value_type log_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); """ # ------------------------------------------------------------------------------ # Utility functions check = """ #define CHECK(a) {{ \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%s: %d\\n", __FILE__, __LINE__); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} """ limits = """ template static double __get_numeric_precision() { return (double)ldexpf(1.0, -(int)rt); } """ comparison_fp = """ template bool __compare_values(nsimd::fixed_point::fp_t val, double ref){ return nsimd_scalar_abs_f64(double(val) - ref) <= __get_numeric_precision(); } """ comparison_log = """ template bool __check_logical_val(T val, nsimd::fixed_point::fp_t v0, nsimd::fixed_point::fp_t v1) {{ return (((v0._raw {op_val} v1._raw) && (val != 0)) || (!(v0._raw {op_val} v1._raw) && (val == 0))); }} """ gen_random_val = """ template nsimd::fixed_point::fp_t __gen_random_val() {{ float tmp = (float) rand() / (float) RAND_MAX; return nsimd::fixed_point::fp_t(0.5f * tmp + 1.0f); }} """ # ------------------------------------------------------------------------------ # Template for arithmetic binary operators arithmetic_test_template = """ 
{includes} // ----------------------------------------------------------------------------- {decls} // ----------------------------------------------------------------------------- int main() {{ typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack vec_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); // FP vectors fp_t *tab0_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *tab1_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *res_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); // Floating point equivalent double *tab0_f = (double *) malloc(v_size * sizeof(double)); double *tab1_f = (double *) malloc(v_size * sizeof(double)); double *res_f = (double *) malloc(v_size * sizeof(double)); for (size_t i = 0; i < v_size; i++) {{ tab0_fp[i] = __gen_random_val<{lf}, {rt}>(); tab1_fp[i] = __gen_random_val<{lf}, {rt}>(); tab0_f[i] = double(tab0_fp[i]); tab1_f[i] = double(tab1_fp[i]); }} vec_t v0_fp = nsimd::fixed_point::loadu(tab0_fp); vec_t v1_fp = nsimd::fixed_point::loadu(tab1_fp); vec_t vres_fp = nsimd::fixed_point::{op_name}(v0_fp, v1_fp); nsimd::fixed_point::storeu(res_fp, vres_fp); for (size_t i = 0; i < v_size; i++) {{ res_f[i] = tab0_f[i] {op_val} tab1_f[i]; }} for(size_t i = 0; i < v_size; i++) {{ CHECK(__compare_values(res_fp[i], res_f[i])); }} fprintf(stdout, \"test of {op_name} over fp_t<{lf},{rt}>... 
OK\\n\"); return EXIT_SUCCESS; }} """ arithmetic_ops = [("add", "+"), ("sub", "-"), ("mul", "*"), ("div","/")] def gen_arithmetic_ops_tests(lf, rt, opts): for op_name, op_val in arithmetic_ops: decls = check + limits + comparison_fp + gen_random_val content_src = arithmetic_test_template.format( op_name=op_name, op_val=op_val, lf=lf, rt=rt, includes=includes, decls=decls) filename = get_filename(opts, op_name, lf, rt) if filename == None: continue with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # ------------------------------------------------------------------------------ # Min max operators template minmax_test_template = """ {includes} #define op_min(a, b) ((a) < (b) ?(a) : (b)) #define op_max(a, b) ((a) > (b) ?(a) : (b)) // ----------------------------------------------------------------------------- {decls} // ----------------------------------------------------------------------------- int main() {{ typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack vec_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); // FP vectors fp_t *tab0_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *tab1_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *res_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); int *res_ref = (int *) malloc(v_size * sizeof(int)); for (size_t i = 0; i < v_size; i++) {{ tab0_fp[i] = __gen_random_val<{lf}, {rt}>(); tab1_fp[i] = __gen_random_val<{lf}, {rt}>(); }} vec_t v0_fp = nsimd::fixed_point::loadu(tab0_fp); vec_t v1_fp = nsimd::fixed_point::loadu(tab1_fp); vec_t vres_fp = nsimd::fixed_point::{op_name}(v0_fp, v1_fp); nsimd::fixed_point::storeu(res_fp, vres_fp); for (size_t i = 0; i < v_size; i++) {{ res_ref[i] = op_{op_name}((int) tab0_fp[i]._raw, (int) tab1_fp[i]._raw); }} for(size_t i = 0; i < v_size; i++) {{ CHECK(res_fp[i]._raw == res_ref[i]); }} fprintf(stdout, \"test of {op_name} over fp_t<{lf},{rt}>... 
OK\\n\"); return EXIT_SUCCESS; }} """ minmax_ops = ["min", "max"] def gen_minmax_ops_tests(lf, rt, opts): for op_name in minmax_ops: decls = check + limits + comparison_fp + gen_random_val content_src = minmax_test_template.format( op_name=op_name, lf=lf, rt=rt, includes=includes, decls=decls) filename = get_filename(opts, op_name, lf, rt) if filename == None: continue with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # ------------------------------------------------------------------------------ # Ternary ops (FMA and co) ternary_ops_template = """ {includes} // ----------------------------------------------------------------------------- {decls} // ----------------------------------------------------------------------------- int main() {{ typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack vec_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); // FP vectors fp_t *tab0_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *tab1_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *tab2_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *res_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); // Floating point equivalent double *tab0_f = (double *) malloc(v_size * sizeof(double)); double *tab1_f = (double *) malloc(v_size * sizeof(double)); double *tab2_f = (double *) malloc(v_size * sizeof(double)); double *res_f = (double *) malloc(v_size * sizeof(double)); for (size_t i = 0; i < v_size; i++) {{ tab0_fp[i] = __gen_random_val<{lf}, {rt}>(); tab1_fp[i] = __gen_random_val<{lf}, {rt}>(); tab2_fp[i] = __gen_random_val<{lf}, {rt}>(); tab0_f[i] = double(tab0_fp[i]); tab1_f[i] = double(tab1_fp[i]); tab2_f[i] = double(tab2_fp[i]); }} vec_t v0_fp = nsimd::fixed_point::loadu(tab0_fp); vec_t v1_fp = nsimd::fixed_point::loadu(tab1_fp); vec_t v2_fp = nsimd::fixed_point::loadu(tab2_fp); vec_t vres_fp = nsimd::fixed_point::{op_name}(v0_fp, v1_fp, v2_fp); nsimd::fixed_point::storeu(res_fp, 
vres_fp); for(size_t i = 0; i < v_size; i++) {{ const double a = tab0_f[i]; const double b = tab1_f[i]; const double c = tab2_f[i]; {check_statement} }} for(size_t i = 0; i < v_size; i++) {{ CHECK(__compare_values(res_fp[i], res_f[i])); }} fprintf(stdout, \"test of {op_name} over fp_t<{lf},{rt}>... OK\\n\"); return EXIT_SUCCESS; }} """ ternary_ops = [("fma", "res_f[i] = (a * b) + c;")] def gen_ternary_ops_tests(lf, rt, opts): for op_name, statement in ternary_ops: decls = check + limits + comparison_fp + gen_random_val content_src = ternary_ops_template.format( op_name=op_name, check_statement=statement.format(lf=lf, rt=rt), lf=lf, rt=rt,includes=includes, decls=decls) filename = get_filename(opts, op_name, lf, rt) if filename == None: continue with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # ------------------------------------------------------------------------------ # Template for math operators rec_reference = """ // Rec operator on floating points (avoids to write a particular test for rec) static inline double rec(const double x) {{ return 1.0 / x; }} """ math_test_template = """ {includes} // ----------------------------------------------------------------------------- {decls} // ----------------------------------------------------------------------------- int main() {{ typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack vec_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); // FP vectors fp_t *tab0_fp= (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *res_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); // Floating point equivalent double *tab0_f = (double *) malloc(v_size * sizeof(double)); double *res_f = (double *) malloc(v_size * sizeof(double)); for (size_t i = 0; i < v_size; i++) {{ tab0_fp[i] = __gen_random_val<{lf}, {rt}>(); tab0_f[i] = double(tab0_fp[i]); }} vec_t v0_fp = nsimd::fixed_point::loadu(tab0_fp); vec_t vres_fp = 
nsimd::fixed_point::{op_name}(v0_fp); nsimd::fixed_point::storeu(res_fp, vres_fp); for (size_t i = 0; i < v_size; i++) {{ res_f[i] = {ref_op_name}(tab0_f[i]); }} for(size_t i = 0; i < v_size; i++) {{ CHECK(__compare_values(res_fp[i], res_f[i])); }} fprintf(stdout, \"test of {op_name} over fp_t<{lf},{rt}>... OK\\n\"); return EXIT_SUCCESS; }} """ math_ops = ["rec", "abs"] def gen_math_functions_tests(lf, rt, opts): for op_name in math_ops: decls = check + limits + comparison_fp + gen_random_val if op_name == "rec": decls += rec_reference ref_op_name = 'rec' else: ref_op_name = 'nsimd_scalar_abs_f64' content_src = math_test_template.format(op_name=op_name, lf=lf, rt=rt, ref_op_name=ref_op_name, includes=includes, decls=decls) filename = get_filename(opts, op_name, lf, rt) if filename == None: continue with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # ------------------------------------------------------------------------------ # Comparison operators comparison_test_template = """ {includes} // ----------------------------------------------------------------------------- {decls} // ----------------------------------------------------------------------------- int main(){{ typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack vec_t; typedef nsimd::fixed_point::packl vecl_t; typedef nsimd::fixed_point::packl::value_type log_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); // FP vectors fp_t *tab0_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *tab1_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); log_t *resl_fp = (log_t *) malloc(v_size * sizeof(log_t)); for(size_t i = 0; i < v_size; i++) {{ tab0_fp[i] = __gen_random_val<{lf}, {rt}>(); tab1_fp[i] = __gen_random_val<{lf}, {rt}>(); }} // Be sure there is at least one equality to test all the cases. 
tab0_fp[0] = tab1_fp[0]; vec_t v0_fp = nsimd::fixed_point::loadu(tab0_fp); vec_t v1_fp = nsimd::fixed_point::loadu(tab1_fp); vecl_t vres_fp = nsimd::fixed_point::{op_name}(v0_fp, v1_fp); nsimd::fixed_point::storelu(resl_fp, vres_fp); for(size_t i = 0; i < v_size; i++) {{ CHECK((__check_logical_val( resl_fp[i], tab0_fp[i], tab1_fp[i]))); }} fprintf(stdout, \"test of {op_name} over fp_t<{lf},{rt}>... OK\\n\"); return EXIT_SUCCESS; }} """ comparison_ops = [("eq","=="), ("ne","!="), ("le","<="), ("lt","<"), ("ge",">="), ("gt",">")] def gen_comparison_tests(lf, rt, opts): for op_name, op_val in comparison_ops: decls = check + limits + comparison_log.format(op_val=op_val) + gen_random_val content_src = comparison_test_template.format( op_name=op_name, op_val=op_val, lf=lf, rt=rt, includes=includes, decls=decls) filename = get_filename(opts, op_name, lf, rt) if filename == None: continue with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # ------------------------------------------------------------------------------ # Bitwise binary operators bitwise_binary_test_template = """ {includes} #include // ----------------------------------------------------------------------------- {decls} // ----------------------------------------------------------------------------- int main() {{ typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack{l} vec{l}_t; typedef nsimd::fixed_point::pack{l}::value_type raw_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); raw_t *tab0 = (raw_t *) malloc(v_size * sizeof(raw_t)); raw_t *tab1 = (raw_t *) malloc(v_size * sizeof(raw_t)); raw_t *res = (raw_t *) malloc(v_size * sizeof(raw_t)); for(size_t i = 0; i < v_size; i++) {{ tab0[i] = {rand_statement} tab1[i] = {rand_statement} }} // Be sure there is at least one equality to test all the cases. 
tab0[0] = tab1[0]; vec{l}_t v0 = nsimd::fixed_point::load{l}u(tab0); vec{l}_t v1 = nsimd::fixed_point::load{l}u(tab1); vec{l}_t v_res = nsimd::fixed_point::{op_name}{term}(v0, v1); nsimd::fixed_point::store{l}u(res, v_res); for(size_t i = 0; i < v_size; i++) {{ raw_t a = tab0[i]; raw_t b = tab1[i]; raw_t c = res[i]; CHECK({test_statement}); }} fprintf(stdout, \"test of {op_name}{term} over fp_t<{lf},{rt}>... OK\\n\"); return EXIT_SUCCESS; }} """ bitwise_binary_ops = [("and", "c._raw == (a._raw & b._raw)", "c == (a & b)"), ("andnot", "c._raw == (a._raw & ~b._raw)", "c == (a & ~b)"), ("or", "c._raw == (a._raw | b._raw)", "c == (a | b)"), ("xor","c._raw == ((~a._raw & b._raw) | (a._raw & ~b._raw))", "c == ((~a & b) | (a & ~b))")] def gen_bitwise_ops_tests(lf, rt, opts): for op_name, s0, s1 in bitwise_binary_ops: # {op}b decls = check + limits + gen_random_val content_src = bitwise_binary_test_template.format( op_name=op_name, lf=lf, rt=rt, includes=includes, decls=decls, rand_statement="__gen_random_val<{lf}, {rt}>();".format(lf=lf, rt=rt), test_statement=s0, l="", term="b") filename = get_filename(opts, op_name + "b", lf, rt) if filename != None: with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # {op}l content_src = bitwise_binary_test_template.format( op_name=op_name, lf=lf, rt=rt, includes=includes, decls=decls, rand_statement="(raw_t)(rand() % 2);".format(lf=lf, rt=rt), test_statement=s1, l="l", term="l") filename = get_filename(opts, op_name + "l", lf, rt) if filename != None: with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # ------------------------------------------------------------------------------ # Bitwise unary operators bitwise_unary_test_template = """ {includes} // ----------------------------------------------------------------------------- {decls} // ----------------------------------------------------------------------------- int main() {{ 
typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack{l} vec{l}_t; typedef nsimd::fixed_point::pack{l}::value_type raw_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); raw_t *tab0 = (raw_t *) malloc(v_size * sizeof(raw_t));; raw_t *res = (raw_t *) malloc(v_size * sizeof(raw_t));; for(size_t i = 0; i < v_size; i++) {{ tab0[i] = {rand_statement} }} vec{l}_t v0 = nsimd::fixed_point::load{l}u(tab0); vec{l}_t v_res = nsimd::fixed_point::{op_name}{term}(v0); nsimd::fixed_point::store{l}u(res, v_res); for(size_t i = 0; i < v_size; i++) {{ raw_t a = tab0[i]; raw_t b = res[i]; CHECK({test_statement}); }} fprintf(stdout, \"test of {op_name}{term} over fp_t<{lf},{rt}>... OK\\n\"); return EXIT_SUCCESS; }} """ bitwise_unary_ops = [("not", "b._raw == ~a._raw", "((b == 0) && (a == 1)) | ((b == 1) && (a == 0))")] def gen_unary_ops_tests(lf, rt, opts): for op_name, s0, s1 in bitwise_unary_ops: decls = check + limits + gen_random_val # {op}b content_src = bitwise_unary_test_template.format( op_name=op_name, lf=lf, rt=rt, includes=includes, decls=decls, rand_statement="__gen_random_val<{lf}, {rt}>();".format(lf=lf, rt=rt), test_statement=s0, l="", term="b") filename = get_filename(opts, op_name + "b", lf, rt) if filename != None: with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # {op}l content_src = bitwise_unary_test_template.format( op_name=op_name, lf=lf, rt=rt, includes=includes, decls=decls, rand_statement="(raw_t)(rand() % 2);".format(lf=lf, rt=rt), test_statement=s1, l="l", term="l") filename = get_filename(opts, op_name + "l", lf, rt) if filename != None: with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # ----------------------------------------------------------------------------- # if_else if_else_test_template = """ {includes} // ----------------------------------------------------------------------------- {decls} // 
----------------------------------------------------------------------------- int main() {{ typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack vec_t; typedef nsimd::fixed_point::packl vecl_t; typedef nsimd::fixed_point::packl::value_type log_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); fp_t *tab0_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *tab1_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *res_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); log_t *mask = (log_t *) malloc(v_size * sizeof(log_t)); for(size_t i = 0; i < v_size; i++) {{ tab0_fp[i] = __gen_random_val<{lf}, {rt}>(); tab1_fp[i] = __gen_random_val<{lf}, {rt}>(); mask[i] = (log_t) (rand() % 2); }} vec_t v0 = nsimd::fixed_point::loadu(tab0_fp); vec_t v1 = nsimd::fixed_point::loadu(tab1_fp); vecl_t vl = nsimd::fixed_point::loadlu(mask); vec_t v_res = nsimd::fixed_point::if_else1(vl, v0, v1); nsimd::fixed_point::storeu(res_fp, v_res); for(size_t i = 0; i < v_size; i++) {{ fp_t ref = mask[i] ? tab0_fp[i] : tab1_fp[i]; CHECK(ref._raw == res_fp[i]._raw); }} fprintf(stdout, \"test of if_else1 over fp_t<{lf},{rt}>... 
OK\\n\"); return EXIT_SUCCESS; }} """ def gen_if_else_tests(lf, rt, opts): decls = check + limits + comparison_fp + gen_random_val content_src = if_else_test_template.format( lf=lf, rt=rt, includes=includes, decls=decls) filename = get_filename(opts, "if_else", lf, rt) if filename == None: return with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # ------------------------------------------------------------------------------- load_ops = ["loadu", "loadlu", "loada", "loadla"] store_ops = ["storeu", "storelu", "storea", "storela"] # ------------------------------------------------------------------------------- # Entry point lf_vals = ["4", "8", "16"] rt_vals = ["1", "2", "3", "4", "5", "6", "7", "8"] def doit(opts): common.myprint(opts, 'Generating tests for module fixed_point') for lf in lf_vals: for rt in rt_vals: ## Arithmetic operators gen_arithmetic_ops_tests(lf, rt, opts) ## Min and max operators gen_minmax_ops_tests(lf, rt, opts) ## Ternary_operators gen_ternary_ops_tests(lf, rt, opts) ## Math functions gen_math_functions_tests(lf, rt, opts) ## Comparison operators gen_comparison_tests(lf, rt, opts) ## Bitwise binary operators gen_bitwise_ops_tests(lf, rt, opts) ## Bitwise unary operators gen_unary_ops_tests(lf, rt, opts) ## If_else gen_if_else_tests(lf, rt, opts) ================================================ FILE: egg/modules/fixed_point/hatch.py ================================================ # Copyright (c) 2019 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and 
this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. ## ----------------------------------------------------------------------------- op_list = [ 'len', 'set1', 'loadu', 'loada', 'loadlu', 'loadla', 'storeu', 'storea', 'storelu', 'storela', 'add', 'sub', 'mul', 'div', 'fma', 'min', 'max', 'abs', 'rec', 'eq', 'ne', 'le', 'lt', 'ge', 'gt', 'ifelse1', 'andb', 'andnotb', 'notb', 'orb', 'xorb', 'andl', 'andnotl', 'notl', 'orl', 'xorl' ] # ----------------------------------------------------------------------------- # Imports import modules.fixed_point.gen_tests import modules.fixed_point.gen_doc # ----------------------------------------------------------------------------- def name(): return 'Fixed-point arithmetic' def desc(): return '''This module provides vectorized fixed-point arithmetic through a C++98 API. The programmer can choose the integral type and the place of the coma for representing its fixed-point numbers. 
A number of operators are also provided.''' def doc_menu(): return {'Overview': 'overview', 'API reference': 'api'} # ----------------------------------------------------------------------------- # Entry point def doit(opts): if opts.tests == True or opts.all == True: modules.fixed_point.gen_tests.doit(opts) if opts.doc == True or opts.all == True: modules.fixed_point.gen_doc.doit(opts, op_list) ================================================ FILE: egg/modules/memory_management/hatch.py ================================================ # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import common # ----------------------------------------------------------------------------- def name(): return 'Memory management' def desc(): return '''This module provides C-style memory managmenent functions: malloc, calloc, free, copy to/from devices, etc... 
Its purpose is to facilitate the use of data buffers in a portable way for systems with CPUs only and for systems with CPUs and GPUs.''' def doc_menu(): return dict() # ----------------------------------------------------------------------------- def doit(opts): common.myprint(opts, 'Generating module memory_management') if not opts.doc: return filename = common.get_markdown_file(opts, 'overview', 'memory_management') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as fout: fout.write('''# Overview This module provides C-style memory managmenent functions. Its purpose is not to become a fully feature container library. It is to provide portable malloc, memcpy and free functions with a little helpers to copy data from and to the devices. # API reference ## Equivalents of malloc, calloc, memcpy and free for devices Note that the below functions simply wraps the corresponding C functions when targeting a CPU. - `template T *device_malloc(size_t sz)`{br} Allocates `sz * sizeof(T)` bytes of memory on the device. On error NULL is returned. - `template T *device_calloc(size_t sz)`{br} Allocates `sz * sizeof(T)` bytes of memory on the device and set the allocated memory to zero. On error NULL is returned. - `template void device_free(T *ptr)`{br} Free the memory pointed to by the given pointer. - `template void copy_to_device(T *device_ptr, T *host_ptr, size_t sz)`{br} Copy data to from host to device. - `template void copy_to_host(T *host_ptr, T *device_ptr, size_t sz)`{br} Copy data to from device to host. - `#define nsimd_fill_dev_mem_func(func_name, expr)`{br} Create a device function that will fill data with `expr`. To call the created function one simply does `func_name(ptr, sz)`. The `expr` argument represents some simple C++ expression that can depend only on `i` the i-th element in the vector as shown in the example below. 
```c++ nsimd_fill_dev_mem_func(prng, ((i * 1103515245 + 12345) / 65536) % 32768) int main() {{ prng(ptr, 1000); return 0; }} ``` ## Pairs of pointers It is often useful to allocate a pair of data buffers: one on the host and one on the devices to perform data transfers. The below functions provides quick ways to malloc, calloc, free and memcpy pointers on host and devices at once. Note that when targeting CPUs the pair of pointers is reduced to one pointer that ponit the a single data buffer in which case memcpy's are not performed. Note also that there is no implicit synchronization of data between both data buffers. It is up to the programmer to triggers memcpy's. ```c++ template struct paired_pointers_t {{ T *device_ptr, *host_ptr; size_t sz; }}; ``` Members of the above structure are not to be modified but can be passed as arguments for reading/writing data from/to memory they point to. - `template paired_pointers_t pair_malloc(size_t sz)`{br} Allocate `sz * sizeof(T)` bytes of memory on the host and on the device. If an error occurs both pointers are NULL. - `template paired_pointers_t pair_malloc_or_exit(size_t sz)`{br} Allocate `sz * sizeof(T)` bytes of memory on the host and on the device. If an error occurs, prints an error message on stderr and exit(3). - `template paired_pointers_t pair_calloc(size_t sz)`{br} Allocate `sz * sizeof(T)` bytes of memory on the host and on the device. Write both data buffers with zeros. If an error occurs both pointers are NULL. - `template paired_pointers_t pair_calloc_or_exit(size_t sz)`{br} Allocate `sz * sizeof(T)` bytes of memory on the host and on the device. Write both data buffers with zeros. If an error occurs, prints an error message on stderr and exit(3). - `template void pair_free(paired_pointers_t p)`{br} Free data buffers on the host and the device. - `template void copy_to_device(paired_pointers_t p)`{br} Copy data from the host buffer to its corresponding device buffer. 
- `template void copy_to_host(paired_pointers_t p)`{br} Copy data from the device buffer to its corresponding host buffer. '''.format(br=' ')) ================================================ FILE: egg/modules/random/hatch.py ================================================ # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import os import common import collections # ----------------------------------------------------------------------------- rand_functions = list() class MAddToRands(type): def __new__(cls, name, bases, dct): ret = type.__new__(cls, name, bases, dct) if name != 'Rand': rand_functions.append(ret()) return ret class Rand(object, metaclass=MAddToRands): def gen_function_name(self, nwords, word_size, nrounds): return '{}_{}x{}_{}'.format(self.name, nwords, word_size, nrounds) def gen_headers(self, opts): res = '' for word_size, nwords_nrounds in self.wordsize_nwords_nrounds.items(): for nwords, list_nrounds in nwords_nrounds.items(): for nrounds in list_nrounds: res += self.gen_signature(nwords, word_size, nrounds)+';' return res def gen_tests(self, opts, nrounds, word_size, nwords): key_size = self.get_key_size(nwords) key_initialization = 'nsimd::packx{} key_pack;'. \ format(key_size, word_size) for i in range (0, key_size): key_initialization += ''' i = {i}; for (int j = 0; j < len; j++) {{ key[j + i * len] = (u{word_size})(j + i * len); }} key_pack.v{i} = nsimd::loadu(&key[i*len], u{word_size}()); '''.format(i=i, word_size=word_size) input_initilization = \ 'memset(in, 0, sizeof(u{}) * {} * ulen);\n'. \ format(word_size, nwords) for i in range (0, nwords): input_initilization += 'in_pack.v{} = nsimd::pack(0);'. 
\ format(i, word_size) compare = '' for i in range (0, nwords): compare += ''' if (i=={i}) {{ nsimd::storeu(out_nsimd, out_pack.v{i}); }} '''.format(i=i) l = 'll' if word_size == 64 else '' cast = '(nsimd_ulonglong)' if word_size == 64 else '' res = ''' #include #include "reference.hpp" #include #ifdef NSIMD_LONGLONG_IS_EXTENSION #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG) #pragma GCC diagnostic ignored "-Wformat" #endif #endif int main() {{ int res = EXIT_SUCCESS; printf("Test of {function_name} ...\\n"); nsimd::packx{nwords} in_pack; nsimd::packx{nwords} out_pack; const int len = nsimd::len(u{word_size}()); const unsigned int ulen = (unsigned int)len; u{word_size} *key = (u{word_size}*)malloc(ulen * sizeof(u{word_size}) * {key_size}); u{word_size} *in = (u{word_size}*)malloc(ulen * sizeof(u{word_size}) * {nwords}); u{word_size} *out = (u{word_size}*)malloc(ulen * sizeof(u{word_size}) * {nwords}); u{word_size} *out_nsimd = (u{word_size}*)malloc(ulen * sizeof(u{word_size})); tab{word_size}x{nwords}_t in_ref; tab{word_size}x{key_size}_t key_ref; tab{word_size}x{nwords}_t out_ref; int i; // Keys {key_initialization} {input_initilization} for (int cpt=0; cpt < 100000; ++cpt) {{ out_pack = nsimd::random::{function_name}(in_pack, key_pack); for (int i=0; i a, pack b, pack *low, pack *high) { nsimd::packx2 a64 = nsimd::upcvt(nsimd::packx2(), a); nsimd::packx2 b64 = nsimd::upcvt(nsimd::packx2(), b); nsimd::packx2 product; product.v0 = a64.v0 * b64.v0; product.v1 = a64.v1 * b64.v1; *high = nsimd::downcvt(nsimd::pack(), product.v0 >> 32, product.v1 >> 32); *low = nsimd::downcvt(nsimd::pack(), product.v0, product.v1); } #else void mulhilo32(pack a, pack b, pack *low, pack *high) { nsimd::pack ah = nsimd::shr(a, 16); nsimd::pack bh = nsimd::shr(b, 16); nsimd::pack al = nsimd::shr(nsimd::shl(a, 16), 16); nsimd::pack bl = nsimd::shr(nsimd::shl(b, 16), 16); nsimd::pack ahbh = ah * bh; nsimd::pack ahbl = ah * bl; nsimd::pack albh = al * bh; nsimd::pack albl = al * bl; 
nsimd::pack tmp1 = nsimd::shl(albh, 16); nsimd::pack tmp2 = nsimd::shl(ahbl, 16); nsimd::pack tmp3 = tmp1 + tmp2; nsimd::pack _1 = nsimd::set1(nsimd::pack(), 1u); nsimd::pack _0 = nsimd::set1(nsimd::pack(), 0u); nsimd::pack carry = nsimd::if_else1((tmp3 < tmp1) || (tmp3 < tmp2), _1, _0); *low = tmp3 + albl; carry = carry + nsimd::if_else1((*low < tmp3) || (*low < albl), _1, _0); *high = ahbh + nsimd::shr(albh, 16) + nsimd::shr(ahbl, 16) + carry; } #endif #if 0 void mulhilo64(pack a, pack b, pack *low, pack *high) { u64 a_buf[8]; u64 b_buf[8]; u64 low_buf[8]; u64 high_buf[8]; nsimd::storeu(a_buf, a); nsimd::storeu(b_buf, b); for (int i = 0; i < nsimd::len(u64()); ++i) { __uint128_t product = ((__uint128_t)a_buf[i]) * ((__uint128_t)b_buf[i]); high_buf[i] = (u64)(product >> 64); low_buf[i] = (u64)product; } *high = nsimd::loadu(high_buf, u64()); *low = nsimd::loadu(low_buf, u64()); } #else void mulhilo64(pack a, pack b, pack *low, pack *high) { nsimd::pack ah = nsimd::shr(a, 32); nsimd::pack bh = nsimd::shr(b, 32); nsimd::pack al = nsimd::shr(nsimd::shl(a, 32), 32); nsimd::pack bl = nsimd::shr(nsimd::shl(b, 32), 32); nsimd::pack ahbh = ah * bh; nsimd::pack ahbl = ah * bl; nsimd::pack albh = al * bh; nsimd::pack albl = al * bl; nsimd::pack tmp1 = nsimd::shl(albh, 32); nsimd::pack tmp2 = nsimd::shl(ahbl, 32); nsimd::pack tmp3 = tmp1 + tmp2; nsimd::pack _1 = nsimd::set1(nsimd::pack(), (u64)1); nsimd::pack _0 = nsimd::set1(nsimd::pack(), (u64)0); nsimd::pack carry = nsimd::if_else1((tmp3 < tmp1) || (tmp3 < tmp2), _1, _0); *low = tmp3 + albl; carry = carry + nsimd::if_else1((*low < tmp3) || (*low < albl), _1, _0); *high = ahbh + nsimd::shr(albh, 32) + nsimd::shr(ahbl, 32) + carry; } #endif ''' def gen_signature(self, nwords, word_size, nrounds): return ('nsimd::packx{nwords} {fun_name}' \ '(nsimd::packx{nwords} in, ' \ 'nsimd::packx{key_size} key)'). 
\ format(nwords = nwords, word_size = word_size, fun_name = self.gen_function_name(nwords, word_size, nrounds), key_size = self.get_key_size(nwords)) def get_key_size(self, nwords): return int(nwords/2) def gen_func(self, opts, nrounds, word_size, nwords): if nwords == 2: bump_keys_init = \ 'nsimd::pack bump = ' \ 'nsimd::set1(nsimd::pack(), {bump});'.\ format(word_size=word_size, bump = '(u64)0x9E3779B97F4A7C15ULL' \ if word_size == 64 else '(u32)0x9E3779B9U') bump_keys = 'key.v0 = key.v0 + bump;' round_init = ''' nsimd::pack mul = nsimd::set1(nsimd::pack(), {mul}); nsimd::pack high, low;'''. \ format(word_size=word_size, mul='(u64)0xD2B74407B1CE6E93ULL' \ if word_size == 64 else '(u32)0xD256D193U') round=''' mulhilo{word_size}(mul, in.v0, &low, &high); in.v0 = high ^ key.v0 ^ in.v1; in.v1 = low; '''.format(word_size=word_size) elif nwords == 4: bump_keys_init = ''' nsimd::pack bump0 = nsimd::set1(nsimd::pack(), {bump0}); nsimd::pack bump1 = nsimd::set1(nsimd::pack(), {bump1});'''.\ format(word_size=word_size, bump0 = '(u64)0x9E3779B97F4A7C15ULL' \ if word_size == 64 else '(u32)0x9E3779B9U', bump1 = '(u64)0xBB67AE8584CAA73BULL' \ if word_size == 64 else '(u32)0xBB67AE85U') bump_keys = 'key.v0 = key.v0 + bump0;\nkey.v1 = key.v1 + bump1;' round_init = ''' nsimd::pack mul0 = nsimd::set1(nsimd::pack(), {mul0}); nsimd::pack mul1 = nsimd::set1(nsimd::pack(), {mul1}); nsimd::pack low0, high0, low1, high1; '''.format(word_size=word_size, mul0='(u64)0xD2E7470EE14C6C93ULL' \ if word_size == 64 else '(u32)0xD2511F53U', mul1='(u64)0xCA5A826395121157ULL' \ if word_size == 64 else '(u32)0xCD9E8D57U') round=''' mulhilo{word_size}(mul0, in.v0, &low0, &high0); mulhilo{word_size}(mul1, in.v2, &low1, &high1); in.v0 = high1 ^ key.v0 ^ in.v1; in.v1 = low1; in.v2 = high0 ^ key.v1 ^ in.v3; in.v3 = low0;'''.format(word_size=word_size) res = self.gen_signature (nwords, word_size, nrounds) res += ' {{ nsimd::packx{} out;'.format(nwords, word_size) res += bump_keys_init res += round_init # 
Round 0: res += round; for i in range(1, nrounds): res += bump_keys res += round res+=''' return in; } ''' return res def generate(self, opts): res = self.mullohi for word_size, nwords_nrounds in self.wordsize_nwords_nrounds.items(): for nwords, list_nrounds in nwords_nrounds.items(): for nrounds in list_nrounds: res += self.gen_func(opts, nrounds, word_size, nwords) return res class ThreeFry(Rand): name = 'threefry' enums=''' enum enum_threefry32x2_rotations { Rot_32x2_0 = 13, Rot_32x2_1 = 15, Rot_32x2_2 = 26, Rot_32x2_3 = 6, Rot_32x2_4 = 17, Rot_32x2_5 = 29, Rot_32x2_6 = 16, Rot_32x2_7 = 24 }; enum enum_threefry32x4_rotations { Rot_32x4_0_0 = 10, Rot_32x4_0_2 = 26, Rot_32x4_1_0 = 11, Rot_32x4_1_2 = 21, Rot_32x4_2_0 = 13, Rot_32x4_2_2 = 27, Rot_32x4_3_0 = 23, Rot_32x4_3_2 = 5, Rot_32x4_4_0 = 6, Rot_32x4_4_2 = 20, Rot_32x4_5_0 = 17, Rot_32x4_5_2 = 11, Rot_32x4_6_0 = 25, Rot_32x4_6_2 = 10, Rot_32x4_7_0 = 18, Rot_32x4_7_2 = 20 }; enum enum_threefry64x2_rotations { Rot_64x2_0 = 16, Rot_64x2_1 = 42, Rot_64x2_2 = 12, Rot_64x2_3 = 31, Rot_64x2_4 = 16, Rot_64x2_5 = 32, Rot_64x2_6 = 24, Rot_64x2_7 = 21 }; enum enum_threefry64x4_rotations { Rot_64x4_0_0 = 14, Rot_64x4_0_2 = 16, Rot_64x4_1_0 = 52, Rot_64x4_1_2 = 57, Rot_64x4_2_0 = 23, Rot_64x4_2_2 = 40, Rot_64x4_3_0 = 5, Rot_64x4_3_2 = 37, Rot_64x4_4_0 = 25, Rot_64x4_4_2 = 33, Rot_64x4_5_0 = 46, Rot_64x4_5_2 = 12, Rot_64x4_6_0 = 58, Rot_64x4_6_2 = 22, Rot_64x4_7_0 = 32, Rot_64x4_7_2 = 32 }; ''' # Following macros should not be changed to function : gcc can't inline them rotations=''' #define SHIFT_MOD_32(x, N) ((x << (N & 31)) | (x >> ((32 - N) & 31))) #define SHIFT_MOD_64(x, N) ((x << (N & 63)) | (x >> ((64 - N) & 63))) ''' undef_macro=''' #undef SHIFT_MOD_32 #undef SHIFT_MOD_64 ''' wordsize_nwords_nrounds = {32: {2: [12, 20, 32], 4: [12, 20, 72]}, 64: {2: [13, 20, 32], 4: [12, 20, 72]}} def gen_signature(self, nwords, word_size, nrounds): return '''nsimd::packx{nwords} \ {fun_name} \ (nsimd::packx{nwords} in, \ 
nsimd::packx{nwords} key)'''. \ format(nwords=nwords, word_size = word_size, fun_name=self.gen_function_name(nwords, word_size, nrounds)) def get_key_size(self, nwords): return nwords def gen_body(self, opts, nrounds, word_size, nwords): if word_size == 32: initialize_keys = '''nsimd::pack ks{nwords} = nsimd::set1(nsimd::pack(), 0x1BD11BDAU);'''. \ format(nwords=nwords) elif word_size == 64: initialize_keys = '''nsimd::pack ks{nwords} = nsimd::set1(nsimd::pack(), (u64)0x1BD11BDAA9FC1A22ULL);'''. \ format(nwords=nwords) res = self.gen_signature(nwords, word_size, nrounds) res += ' {{ nsimd::packx{} out;'.format(nwords, word_size) res += initialize_keys initialisation_keys = ''' nsimd::pack ks{i}; ks{i} = key.v{i}; out.v{i} = in.v{i}; ks{nwords} = ks{nwords} ^ key.v{i}; out.v{i} = out.v{i} + key.v{i}; ''' for i in range(0,nwords): res += initialisation_keys.format(i=i, nwords=nwords, word_size=word_size) for i in range(0, nrounds): if nwords == 4: indexes= [1 if i%2==0 else 3, 1 if i%2==1 else 3] res += ''' out.v0 = out.v0 + out.v{index0}; out.v{index0} = SHIFT_MOD_{word_size}(out.v{index0}, Rot_{word_size}x{nwords}_{i_mod}_0); out.v{index0} = out.v{index0} ^ out.v0; out.v2 = out.v2 + out.v{index1}; out.v{index1} = SHIFT_MOD_{word_size}(out.v{index1}, Rot_{word_size}x{nwords}_{i_mod}_2); out.v{index1} = out.v{index1} ^ out.v2; '''.format(index0=indexes[0], index1=indexes[1], i_mod=i%8, word_size=word_size, nwords=nwords) elif nwords == 2: res += ''' out.v0 = out.v0 + out.v1; out.v1 = SHIFT_MOD_{word_size}(out.v1, Rot_{word_size}x{nwords}_{i_mod}); out.v1 = out.v1 ^ out.v0;'''. \ format(i_mod=i % 8, word_size=word_size, nwords=nwords) #if (i % nwords) == nwords - 1: if (i % 4) == 3: d = int(i / 4 + 1) res += '\n' for j in range(0, nwords): res += 'out.v{j} = out.v{j} + ks{calc};\n'. \ format(j=j, calc=str(int((d+j)%(nwords+1)))) res += 'out.v{n} = out.v{n} + ' \ 'nsimd::pack({d});\n'. 
\ format(d=d, n=nwords-1, word_size=word_size) res+=''' return out; } ''' return res def generate(self, opts): res = '' res += self.enums res += self.rotations for word_size, nwords_nrounds in self.wordsize_nwords_nrounds.items(): for nwords, list_nrounds in nwords_nrounds.items(): for nrounds in list_nrounds: res += self.gen_body(opts, nrounds, word_size, nwords) res += self.undef_macro return res def gen_functions(opts): ## Write source files #dirname = os.path.join(opts.include_dir, 'modules', 'random') #common.mkdir_p(dirname) #filename = os.path.join(dirname, 'functions.cpp') #print(filename) #with common.open_utf8(opts, filename) as out: # out.write('#include "functions.hpp"\n') # out.write('{}\n\n'.format(common.hbar)) # out.write(gen(opts)) # out.write('#endif\n') #common.clang_format(opts, filename) # Write headers dirname = os.path.join(opts.include_dir, 'modules', 'random') common.mkdir_p(dirname) filename = os.path.join(dirname, 'functions.hpp') with common.open_utf8(opts, filename) as out: out.write( '''#ifndef NSIMD_MODULES_RANDOM_FUNCTIONS_HPP #define NSIMD_MODULES_RANDOM_FUNCTIONS_HPP #include #include #include #ifdef NSIMD_LONGLONG_IS_EXTENSION #if defined(NSIMD_IS_GCC) /* Not emitting the warning -Wlong-long is not possible */ /* with GCC <= 12. It is a bug. A workaround is to tell GCC */ /* to consider this header file as a system header file so */ /* that all warnings are not emitted. This is not satisfying */ /* but necessary for the moment. 
*/ #pragma GCC system_header #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wlong-long" #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wlong-long" #endif #endif namespace nsimd { namespace random { ''') out.write('{}\n\n'.format(common.hbar)) for func in rand_functions: out.write(func.gen_headers(opts)) out.write(func.generate(opts)) out.write( '''#ifdef NSIMD_LONGLONG_IS_EXTENSION #if defined(NSIMD_IS_GCC) #pragma GCC diagnostic pop #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic pop #endif #endif } // namespace nsimd } // namespace random #endif ''') common.clang_format(opts, filename) def gen_tests(opts): for func in rand_functions: for word_size, nwords_nrounds in func.wordsize_nwords_nrounds.items(): for nwords, list_nrounds in nwords_nrounds.items(): for nrounds in list_nrounds: # Write headers dirname = os.path.join(opts.tests_dir, 'modules', 'random') common.mkdir_p(dirname) filename = os.path.join(dirname, '{}.cpp'. \ format(func.gen_function_name(nwords, word_size, nrounds))) with common.open_utf8(opts, filename) as out: out.write(func.gen_tests(opts, nrounds, word_size, nwords)) common.clang_format(opts, filename) # ----------------------------------------------------------------------------- def name(): return 'Random number generators' def desc(): return \ 'This module define functions that generate pseudorandom numbers using' \ 'algorithms described in Parallel Random Numbers: As Easy as 1,2,3, by' \ 'John K. Salmon, Mark A. Moraes, Ron O. Dror and David E.Shaw.' def gen_doc(opts): api = '' for func in rand_functions: for word_size, nwords_nrounds in func.wordsize_nwords_nrounds.items(): for nwords, list_nrounds in nwords_nrounds.items(): for nrounds in list_nrounds: api += '- `' + func.gen_signature(nwords, word_size, nrounds) + '`; \n' api += ' Returns a random number using the ' \ '{func_name} generator\n\n'. 
\ format(func_name=func.name) res = ''' # NSIMD Random module overview {desc} Two different algorithms are proposed : threefry and philox. Both should give high quality random number. Threefry is quicker on CPU, while philox is best used on GPU. Both algorithms are counter based pseudorandom number generator, meaning that they need two parameters: - a key, each key will generate an unique sequence, - a counter, which will give the different numbers in the sequence. # NSIMD Random API reference {api} '''.format(desc = desc(), api=api) filename = common.get_markdown_file(opts, 'overview', 'random') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as fout: fout.write(res) def doc_menu(): return dict() # ----------------------------------------------------------------------------- def doit(opts): common.myprint(opts, 'Generating module random') if opts.library: gen_functions(opts) if opts.tests: gen_tests(opts) if opts.doc: gen_doc(opts) ================================================ FILE: egg/modules/spmd/hatch.py ================================================ # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import os import operators import common import gen_scalar_utilities import gen_tests as nsimd_tests # ----------------------------------------------------------------------------- # CUDA: default number of threads per block tpb = 128 gpu_params = '(n + {}) / {}, {}'.format(tpb, tpb - 1, tpb) # ----------------------------------------------------------------------------- # helpers def append(s1, s2): if s1 == '': return s2 if s2 == '': return s1 return s1 + ', ' + s2 k_typ = {'i': 'k_int', 'u': 'k_uint', 'f': 'k_float'} def get_signature(op): args = ', '.join(['a{}'.format(i - 1) for i in range(1, len(op.params))]) if op.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES or \ op.name == 'to_mask': args = append('to_type', args) return '#define k_{}({})'.format(op.name, args) # ----------------------------------------------------------------------------- def gen_doc_overview(opts): filename = common.get_markdown_file(opts, 'overview', 'spmd') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as fout: fout.write('''# Overview ## What is SPMD? SPMD stands for *Single Program Multiple Data*. It is a programming paradigm. It is used by NVIDIA CUDA. Its strengh lies in writing computation kernels. Basically you concentrate your attention on the kernel itself and not on how to run it. An example is worth more than a long speech, let's take vector addition of `float`'s. ```c++ spmd_kernel_1d(add, float *dst, float *a, float *b) k_store(dst, k_load(a) + k_load(b)); spmd_kernel_end ``` It would be written as follows for CUDA (assuming that the vector lenghts are multiples of block's sizes). 
```c++ __global__ add(float *dst, float *a, float *b) { int i = blockIdx.x * blockDim.x + threadIdx.x; dst[i] = a[i] + b[i]; } ``` NSIMD's SPMD is a small DSL in standard C++98 that can be used to write computation kernels for GPUs (NVIDIA's and AMD's) and any SIMD units supported by NSIMD. On a more technical side, the DSL keywords are macros that: - translates to C-ish keywords for GPUs and - use masks for CPUs as Intel ISPC (). The difference between NSIMD's SPMD is that a single code can be compiled to target GPUs and CPUs whereas: - NVIDIA CUDA only targets NVIDIA GPUs - AMD HIP only targets NVIDIA and AMD GPUs - INTEL ICP only targets Intel SIMD units and ARM NEON ## Writing kernels and device functions As for CUDA kernels you can write templated and non-templated CUDA kernels. Declaring a kernel function and launching it is straight forward: ```c++ spmd_kernel_1d(kernel_name, arguments) // kernel code spmd_kernel_end int main() { spmd_launch_kernel_1d(kernel_name, bit_width, param, vector_size, arguments); return 0; } ``` The `bit_width` argument indicates the types width in bits that will be available inside kernels. The `param` argument indicates the unroll factor for CPUs and the number of threads per block for GPUs. The `vector_size` argument indicates the vectors length passed as arguments. Device functions can also been implemented. They are functions that will only run on the device. As for kernels, they have the same restrictions. ```c++ spmd_dev_func(k_float device_func, k_float a, k_float b) // Device function code spmd_dev_func_end spmd_kernel_1d(kernel, arguments) // ... spmd_call_dev_func(device_func, a, b); // ... spmd_kernel_end ``` The caveat with `spmd_dev_func` is that its first argument must be the return type followed by the device function name. It is also possible to write templated kernels. Due to C++ `__VA_ARGS__` limitations the number of template argument is limited to one of kind `typename`. 
If more types or integers are to be passed to device kernels or functions they have to be boxed inside a struct. ```c++ struct mul_t { spmd_dev_func(static k_float dev_impl, k_float a, k_float b) return a * b; spmd_dev_func_end }; struct add_t { spmd_dev_func(static k_float dev_impl, k_float a, k_float b) return a + b; spmd_dev_func_end }; // Op is the template argument (typename Op in C++ code) spmd_tmpl_dev_func(k_float trampoline, Op, k_float a, k_float b) return Op::template spmd_call_dev_func(dev_impl, a, b); spmd_dev_func_end // Op is the template argument (typename Op in C++ code) spmd_tmpl_kernel_1d(tmpl_kernel, Op, arguments) // ... spmd_call_tmpl_dev_func(trampoline, Op, a, b); // ... spmd_kernel_end int main() { // Kernel call for addition spmd_launch_tmpl_kernel_1d(tmpl_kernel, add_t, 32, 1, N, arguments); // Kernel call for multiplication spmd_launch_tmpl_kernel_1d(tmpl_kernel, mul_t, 32, 1, N, arguments); return 0; } ``` ## The NSIMD SPMD C++ DSL The DSL is of course constraint by C++ syntax and constructs. This implies some strange syntax and the impossibility to use infix operator `=`. For now (2020/05/16) the NSIMD SPMD DSL does only supports `if`'s, while-loops and `returns`. It seems that for-loops and do-while-loops cannot be nicely proposed, i.e. with a nice syntax, the switch-case keywords cannot be implemented with a good conformence to the semantic of their C++ counterparts. Goto's also cannot be implemented properly. ### Variables types available in kernels and device functions The following self-explanatory variable types are available inside kernels and devices functions: - `k_int` for signed integers - `k_uint` for unsigned integers - `k_float` for floatting point numbers - `k_bool` for booleans As explained above the bit-width of the above types are determined by the launch kernel function. Note that `k_float` does not exists for 8-bits types. 
### Load/store from/to memory Given a pointer, the proper way to load data is to use `k_load(ptr)`. For storing a value to memory `k_store` is to be used. ```c++ k_store(ptr, value); k_store(ptr, expression); ``` As explained above, there is no need to compute the offset to apply to pointers. This is hidden from the programmer. ### Assignment operator (`operator=`) Due to C++ ADL () and the need for keeping things simple for the compiler (which does not always mean simple for the programmer) the use of infix operator `=` will not produce a copmilation error but will give incorrect result. You should use `k_set`. ```c++ k_set(var, value); k_set(var, expression); ``` As written above, `k_set` assign value or the result of an expression to a variable. ### if, then, else You should not use plan C++ `if`'s or `else`'s. This will not cause compilation error but will produce incorrect results at runtime. You should use `k_if`, `k_else`, `k_elseif` and `k_endif` instead. they have the same semantic as their C++ counterparts. ```c++ spmd_kernel_1d(if_elseif_else, float *dst, float *a_ptr) k_float a, ret; k_set(a, k_load(a_ptr)); k_if (a > 15.0f) k_set(ret, 15.0f); k_elseif ( a > 10.0f) k_set(ret, 10.0f); k_elseif ( a > 5.0f) k_set(ret, 5.0f); k_else k_set(ret, 0.0f); k_endif k_store(dst, ret); spmd_kernel_end ``` ### while loops You should not use plan C++ `while`'s, `break`'s and `continue`'s. This will not cause compilation error but will produce incorrect results at runtime. You should use `k_while`, `k_break`, `k_continue` and `k_endif` instead. They have the same semantic as their C++ counterparts. 
```c++ spmd_kernel_1d(binpow, float *dst, float *a_ptr, int *p_ptr) k_float a, ret; k_set(a, k_load(a_ptr)); k_set(ret, 1.0f); k_int p; k_set(p, k_load(p_ptr)); k_while(p > 0) k_if ((p & 1) != 0) k_set(ret, ret * a); k_endif k_set(a, a * a); k_set(p, p >> 1); k_endwhile k_store(dst, ret); spmd_kernel_end ``` ### Returns Returns cannot be implemented as macros overloading is not possible in a standard way with an overload taking zero arguments. So returning has to be done correctly. The `k_return` keyword has the same semantic as the C++ `return` keyword without arguments and can be used at will for kernels (as kernels return type is always `void`) and for device functions returning `void`. For device functions returning a value it is recommanded to proceed this way: 1. Declare a variable, say `ret`, to store the return value. 2. Whereever you need to return, set the variable appropriately with `k_set` and return with `k_return`. 3. At the end of the function use `return ret;`. ```c++ spmd_dev_func(k_int func, k_int a) k_float ret; k_if (a == 0) k_set(ret, 0); k_return; k_endif k_if (a == 1) k_set(ret, -1); k_return; k_endif k_set(ret, a); return ret; spmd_dev_func_end ``` ## Advanced techniques and functions This paragraph applies mainly when targeting CPUs. Using techniques described below won't affect GPUs. If you are familiar with the SIMD technique of masking to emulate loops and if's you may know that `k_set` and `k_store` are implemented using respectively `nsimd::if_else` and `nsimd::maskz_storeu` which may incur performance penalties. When you know that a simple assignment or store is sufficient you may use the unmasked variants: - `k_unmasked_set` translates into a C++ assignment. - `k_unmasked_store` translates into a C++ SIMD store. Their arguments are exactly the same as `k_set` and `k_store`. 
Unmasked operations can usually be used at the beginning of device functions and also inside loops, on temporary variables, knowing that the result of the latter won't be needed later. You may also use C++ standard keywords and constructs. But be aware that doing so will apply all the same treatment too all SIMD lanes. This can be useful when the operations involved are independant of the processed data as in the example below. ```c++ spmd_dev_func(k_float newton_raphson_sqrt, k_float a, k_float x0) k_float ret; for (int i = 0; i < 6; i++) { k_unmasked_set(ret, (ret + ret * a) / 2.0f); } return ret; spmd_dev_func_end ``` ''') # ----------------------------------------------------------------------------- def gen_doc_api(opts): filename = common.get_markdown_file(opts, 'api', 'spmd') if not common.can_create_filename(opts, filename): return # Build tree for api.md api = dict() for _, operator in operators.operators.items(): if not operator.has_scalar_impl: continue for c in operator.categories: if c not in api: api[c] = [operator] else: api[c].append(operator) with common.open_utf8(opts, filename) as fout: fout.write( '''# NSIMD SPMD API reference This page contains the exhaustive API of the SPMD module. Note that most operators names follow the simple naming `k_[NSIMD name]` and have the same semantics. This page is light, you may use CTRL+F to find the operator you are looking for. For genericity on the base type you should use operator names instead of infix operators, e.g. `k_add` instead of `+`. Indeed for `f16`'s NVIDIA CUDA and NSIMD do not provide overloads and therefore code using `+` will fail to compile. Note that all operators accept literals and scalars. For example you may write `k_add(a, 1)` or `float s; k_add(a, s);`. This also applies when using infix operators. But note that literals or scalars must have the same type as the other operands. 
''') for c, ops in api.items(): if len(ops) == 0: continue fout.write('\n## {}\n\n'.format(c.title)) for op in ops: fout.write('- `{}` \n'.format(get_signature(op))) if op.cxx_operator != None: fout.write(' Infix operator: `{}` ' \ '(*for certain types only*) \n'.\ format(op.cxx_operator)) fout.write(' {}\n\n'.format(op.desc)) # ----------------------------------------------------------------------------- def gen_tests_for_shifts(opts, t, operator): op_name = operator.name dirname = os.path.join(opts.tests_dir, 'modules', 'spmd') common.mkdir_p(dirname) filename = os.path.join(dirname, '{}.{}.cpp'.format(op_name, t)) if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as out: out.write( '''#include #include #include #include "../common.hpp" #if defined(NSIMD_CUDA) __global__ void kernel({typ} *dst, {typ} *a0, int n, int s) {{ int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < n) {{ dst[i] = nsimd::gpu_{op_name}(a0[i], s); }} }} void compute_result({typ} *dst, {typ} *a0, unsigned int n, int s) {{ kernel<<<{gpu_params}>>>(dst, a0, int(n), s); }} {cbprng_cuda} #elif defined(NSIMD_ROCM) __global__ void kernel({typ} *dst, {typ} *a0, size_t n, int s) {{ size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if (i < n) {{ dst[i] = nsimd::gpu_{op_name}(a0[i], s); }} }} void compute_result({typ} *dst, {typ} *a0, size_t n, int s) {{ hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, a0, n, s); }} {cbprng_hip} #elif defined(NSIMD_ONEAPI) inline void kernel({typ} *dst, {typ} *a0, const size_t n, const int s, sycl::nd_item<1> item) {{ const size_t ii = item.get_global_id().get(0); if (ii < n){{ dst[ii] = nsimd::gpu_{op_name}(a0[ii], s); }} }} void compute_result({typ} *dst, {typ} *a0, size_t n, int s) {{ size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb}); sycl::queue q_ = nsimd::oneapi::default_queue(); q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), sycl::range<1>({tpb})), 
[=](sycl::nd_item<1> item){{ kernel(dst, a0, n, s, item); }}).wait_and_throw(); }} {cbprng_oneapi} #else void compute_result({typ} *dst, {typ} *a0, unsigned int n, int s) {{ for (unsigned int i = 0; i < n; i++) {{ dst[i] = nsimd::scalar_{op_name}(a0[i], s); }} }} {cbprng_cpu} #endif // clang-format off spmd_kernel_1d(kernel, {typ} *dst, {typ} *a0, int s) k_store(dst, k_{op_name}(k_load(a0), s)); spmd_kernel_end // clang-format on int main() {{ unsigned int n_[3] = {{ 10, 1001, 10001 }}; for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{ unsigned int n = n_[i]; for (int s = 0; s < {typnbits}; s++) {{ int ret = 0; {typ} *a0 = nsimd::device_calloc<{typ}>(n); random(a0, n, 0); {typ} *ref = nsimd::device_calloc<{typ}>(n); {typ} *out = nsimd::device_calloc<{typ}>(n); spmd_launch_kernel_1d(kernel, {typnbits}, 1, n, out, a0, s); compute_result(ref, a0, n, s); if (!cmp(ref, out, n)) {{ ret = -1; }} nsimd::device_free(a0); nsimd::device_free(ref); nsimd::device_free(out); if (ret != 0) {{ return ret; }} }} }} return 0; }} '''.format(typ=t, op_name=op_name, typnbits=t[1:], tpb=tpb, cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'), cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda', gpu_params), cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip', gpu_params), cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi', ['(int)n', str(tpb)]), gpu_params=gpu_params)) common.clang_format(opts, filename, cuda=True) # ----------------------------------------------------------------------------- def gen_tests_for_cvt_reinterpret(opts, tt, t, operator): op_name = operator.name dirname = os.path.join(opts.tests_dir, 'modules', 'spmd') common.mkdir_p(dirname) filename = os.path.join(dirname, '{}.{}_{}.cpp'.format(op_name, t, tt)) if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as out: out.write( '''#include #include #include #include "../common.hpp" #if defined(NSIMD_CUDA) __global__ void kernel({typ} *dst, {typ} *a0, int n) {{ int i 
= threadIdx.x + blockIdx.x * blockDim.x; if (i < n) {{ dst[i] = nsimd::gpu_{op_name}({typ}(), nsimd::gpu_{op_name}( {totyp}(), a0[i])); }} }} void compute_result({typ} *dst, {typ} *a0, unsigned int n) {{ kernel<<<{gpu_params}>>>(dst, a0, int(n)); }} {cbprng_cuda} #elif defined(NSIMD_ROCM) __global__ void kernel({typ} *dst, {typ} *a0, size_t n) {{ size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if (i < n) {{ dst[i] = nsimd::gpu_{op_name}({typ}(), nsimd::gpu_{op_name}( {totyp}(), a0[i])); }} }} void compute_result({typ} *dst, {typ} *a0, size_t n) {{ hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, a0, n); }} {cbprng_hip} #elif defined(NSIMD_ONEAPI) inline void kernel({typ} *dst, {typ} *a0, const size_t n, sycl::nd_item<1> item) {{ const size_t ii = item.get_global_id().get(0); if (ii < n){{ dst[ii] = nsimd::gpu_{op_name}({typ}(), nsimd::gpu_{op_name}( {totyp}(), a0[ii])); }} }} void compute_result({typ} *dst, {typ} *a0, size_t n) {{ size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb}); sycl::queue q_ = nsimd::oneapi::default_queue(); q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), sycl::range<1>({tpb})), [=](sycl::nd_item<1> item){{ kernel(dst, a0, n, item); }}).wait_and_throw(); }} {cbprng_oneapi} #else void compute_result({typ} *dst, {typ} *a0, unsigned int n) {{ for (unsigned int i = 0; i < n; i++) {{ dst[i] = nsimd::scalar_{op_name}({typ}(), nsimd::scalar_{op_name}( {totyp}(), a0[i])); }} }} {cbprng_cpu} #endif // clang-format off spmd_kernel_1d(kernel, {typ} *dst, {typ} *a0) k_store(dst, k_{op_name}({k_typ}, k_{op_name}({k_totyp}, k_load(a0)))); spmd_kernel_end // clang-format on int main() {{ unsigned int n_[3] = {{ 10, 1001, 10001 }}; for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{ unsigned int n = n_[i]; int ret = 0; {typ} *a0 = nsimd::device_calloc<{typ}>(n); random(a0, n, 0); {typ} *ref = nsimd::device_calloc<{typ}>(n); {typ} *out = nsimd::device_calloc<{typ}>(n); spmd_launch_kernel_1d(kernel, 
{typnbits}, 1, n, out, a0); compute_result(ref, a0, n); if (!cmp(ref, out, n)) {{ ret = -1; }} nsimd::device_free(a0); nsimd::device_free(ref); nsimd::device_free(out); if (ret != 0) {{ return ret; }} }} return 0; }} '''.format(typ=t, totyp=tt, op_name=op_name, typnbits=t[1:],
               gpu_params=gpu_params, k_typ=k_typ[t[0]], tpb=tpb,
               cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),
               # NOTE(review): unlike the hip/oneapi calls here and unlike
               # gen_tests_for below, the cuda call is made without
               # gpu_params -- confirm this is intentional.
               cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda'),
               cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip', gpu_params),
               cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',
                                                ['(int)n', str(tpb)]),
               k_totyp=k_typ[tt[0]]))
    common.clang_format(opts, filename, cuda=True)

# -----------------------------------------------------------------------------

def gen_tests_for(opts, t, operator):
    '''Generate the SPMD test file "<op_name>.<t>.cpp" for one operator and
    one base type.

    The generated C++ contains a reference implementation selected by
    preprocessor (CUDA, ROCm, oneAPI, or a scalar CPU fallback) plus an spmd
    kernel built from k_* primitives; its main() compares both results on
    three buffer sizes.'''
    op_name = operator.name
    dirname = os.path.join(opts.tests_dir, 'modules', 'spmd')
    common.mkdir_p(dirname)
    filename = os.path.join(dirname, '{}.{}.cpp'.format(op_name, t))
    if not common.can_create_filename(opts, filename):
        return
    # One input buffer a<i> per operator input parameter (params[0] is the
    # return kind).
    arity = len(operator.params[1:])
    k_args = ', '.join(['{} *a{}'.format(t, i) for i in range(arity)])
    k_call_args = ', '.join(['a{}'.format(i) for i in range(arity)])
    fill_tabs = '\n'.join(['{typ} *a{i} = nsimd::device_calloc<{typ}>(n);\n' \
                           'random(a{i}, n, {i});'.format(typ=t, i=i) \
                           for i in range(arity)])
    free_tabs = '\n'.join(['nsimd::device_free(a{i});'. \
                           format(typ=t, i=i) for i in range(arity)])

    # spmd: build the k_* expression for the spmd kernel.
    def get_cte_spmd(typ, cte):
        # f16 constants must be built through the f32 -> f16 helper.
        if typ == 'f16':
            return 'k_f32_to_f16((f32){})'.format(cte)
        else:
            return '({}){}'.format(typ, cte)

    def spmd_load_code(param, typ, i):
        # 'l' (logical) inputs are synthesized as the predicate "a<i> < 4".
        if param == 'l':
            return 'k_lt(k_load(a{}), {})'.format(i, get_cte_spmd(typ, 4))
        if param == 'v':
            return 'k_load(a{})'.format(i)

    args = ', '.join([spmd_load_code(operator.params[i + 1], t, i) \
                      for i in range(arity)])
    if op_name == 'to_mask':
        args = k_typ[t[0]] + ', ' + args
    if operator.params[0] == 'v':
        k_code = 'k_store(dst, k_{}({}));'.format(op_name, args)
    else:
        # Logical result: store 1 or 0 depending on the predicate.
        k_code = '''k_if (k_{}({})) k_store(dst, 1); k_else k_store(dst, 0); k_endif'''.format(op_name, args)

    # gpu: the same expression for the CUDA/ROCm and oneAPI reference kernels.
    def get_cte_gpu(typ, cte, target):
        # CUDA/ROCm halves are made with __float2half; other cases use a
        # plain C cast.
        if typ == 'f16' and target == 'cuda_rocm':
            return '__float2half((f32){})'.format(cte)
        else:
            return '({}){}'.format(typ, cte)

    def gpu_load_code(param, typ, i, target):
        if param == 'l':
            return 'nsimd::gpu_lt(a{}[i], {})'. \
                   format(i, get_cte_gpu(typ, 4, target))
        if param == 'v':
            return 'a{}[i]'.format(i)

    args_cuda_rocm = ', '.join([gpu_load_code(operator.params[i + 1], t, i,
                                              'cuda_rocm') \
                                for i in range(arity)])
    args_oneapi = ', '.join([gpu_load_code(operator.params[i + 1], t, i,
                                           'oneapi') for i in range(arity)])
    if op_name == 'to_mask':
        args_cuda_rocm = t + '(), ' + args_cuda_rocm
        args_oneapi = t + '(), ' + args_oneapi
    if operator.params[0] == 'v':
        cuda_rocm_kernel = 'dst[i] = nsimd::gpu_{}({});'. \
                           format(op_name, args_cuda_rocm)
        oneapi_kernel = 'dst[i] = nsimd::gpu_{}({});'. \
                        format(op_name, args_oneapi)
    else:
        # Logical result on GPU: branch on the predicate, store 1/0.
        tmpl = '''if (nsimd::gpu_{}({{}})) {{{{ dst[i] = {{}}; }}}} else {{{{ dst[i] = {{}}; }}}}'''.format(op_name)
        cuda_rocm_kernel = tmpl.format(args_cuda_rocm,
                                       get_cte_gpu(t, 1, 'cuda_rocm'),
                                       get_cte_gpu(t, 0, 'cuda_rocm'))
        oneapi_kernel = tmpl.format(args_oneapi, get_cte_gpu(t, 1, 'oneapi'),
                                    get_cte_gpu(t, 0, 'oneapi'))

    # cpu: scalar fallback used when no GPU backend is enabled.
    def get_cte_cpu(typ, cte):
        if typ == 'f16':
            return 'nsimd_f32_to_f16((f32){})'.format(cte)
        else:
            return '({}){}'.format(typ, cte)

    def cpu_load_code(param, typ, i):
        if param == 'l':
            return 'nsimd::scalar_lt(a{}[i], {})'. \
                   format(i, get_cte_cpu(typ, 4))
        if param == 'v':
            return 'a{}[i]'.format(i)

    args = ', '.join([cpu_load_code(operator.params[i + 1], t, i) \
                      for i in range(arity)])
    if op_name == 'to_mask':
        args = t + '(), ' + args
    if operator.params[0] == 'v':
        cpu_kernel = 'dst[i] = nsimd::scalar_{}({});'.format(op_name, args)
    else:
        cpu_kernel = '''if (nsimd::scalar_{op_name}({args})) {{ dst[i] = {one}; }} else {{ dst[i] = {zero}; }}'''.format(op_name=op_name, args=args, one=get_cte_cpu(t, 1), zero=get_cte_cpu(t, 0))
    # Integer types are compared exactly; floating point types use the
    # operator's ulp budget (operator.ufp).
    comp = '!cmp(ref, out, n{})'.format('' if t in common.iutypes \
                                        else ', {}'.format(operator.ufp[t]))
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''#include #include #include #include "../common.hpp" #if defined(NSIMD_CUDA) __global__ void kernel({typ} *dst, {k_args}, int n) {{ int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < n) {{ {cuda_rocm_kernel} }} }} void compute_result({typ} *dst, {k_args}, unsigned int n) {{ kernel<<<{gpu_params}>>>(dst, {k_call_args}, int(n)); }} {cbprng_cuda} #elif defined(NSIMD_ROCM) __global__ void kernel({typ} *dst, {k_args}, size_t n) {{ size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if (i < n) {{ {cuda_rocm_kernel} }} }} void compute_result({typ} *dst, {k_args}, size_t n) {{ hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, {k_call_args}, n); }} {cbprng_hip} #elif defined(NSIMD_ONEAPI) inline void kernel({typ} *dst, {k_args}, const 
size_t n, sycl::nd_item<1> item) {{ const size_t i = item.get_global_id().get(0); if(i < n){{ {oneapi_kernel} }} }} void compute_result({typ} *dst, {k_args}, size_t n) {{ size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb}); sycl::queue q_ = nsimd::oneapi::default_queue(); q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), sycl::range<1>({tpb})), [=](sycl::nd_item<1> item){{ kernel(dst, {k_call_args}, n, item); }}).wait_and_throw(); }} {cbprng_oneapi} #else void compute_result({typ} *dst, {k_args}, unsigned int n) {{ for (unsigned int i = 0; i < n; i++) {{ {cpu_kernel} }} }} {cbprng_cpu} #endif // clang-format off spmd_kernel_1d(kernel, {typ} *dst, {k_args}) {k_code} spmd_kernel_end // clang-format on #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) #define THREADS_PER_BLOCK 128 #else #define THREADS_PER_BLOCK 1 #endif int main() {{ unsigned int n_[3] = {{ 10, 1001, 10001 }}; for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{ unsigned int n = n_[i]; int ret = 0; {fill_tabs} {typ} *ref = nsimd::device_calloc<{typ}>(n); {typ} *out = nsimd::device_calloc<{typ}>(n); spmd_launch_kernel_1d(kernel, {typnbits}, THREADS_PER_BLOCK, n, out, {k_call_args}); compute_result(ref, {k_call_args}, n); if ({comp}) {{ ret = -1; }} nsimd::device_free(ref); nsimd::device_free(out); {free_tabs} if (ret != 0) {{ return ret; }} }} return 0; }} '''.format(typ=t, free_tabs=free_tabs, fill_tabs=fill_tabs,
           k_code=k_code, k_call_args=k_call_args, k_args=k_args,
           cpu_kernel=cpu_kernel, comp=comp,
           cuda_rocm_kernel=cuda_rocm_kernel, oneapi_kernel=oneapi_kernel,
           cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),
           cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda', gpu_params),
           cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip', gpu_params),
           cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',
                                            ['(int)n', str(tpb)]),
           gpu_params=gpu_params, typnbits=t[1:], tpb=tpb))
    common.clang_format(opts, filename, cuda=True)

def gen_tests(opts):
    '''Generate every SPMD test: one C++ file per (operator, type) pair.'''
    for \
op_name, operator in operators.operators.items():
        if not operator.has_scalar_impl:
            continue
        # NOTE(review): not_closed is computed here but never used in this
        # function -- looks like dead code, confirm before removing.
        not_closed = (operator.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES \
                      or ('v' not in operator.params[1:] and
                          'l' not in operator.params[1:]))
        for t in operator.types:
            tts = common.get_output_types(t, operator.output_to)
            for tt in tts:
                if not nsimd_tests.should_i_do_the_test(operator, tt, t):
                    continue
                # Shifts and type-changing operators need dedicated drivers.
                if operator.name in ['shl', 'shr', 'shra']:
                    gen_tests_for_shifts(opts, t, operator)
                elif operator.name in ['cvt', 'reinterpret', 'reinterpretl']:
                    gen_tests_for_cvt_reinterpret(opts, tt, t, operator)
                else:
                    gen_tests_for(opts, t, operator)

# -----------------------------------------------------------------------------

def gen_functions(opts):
    '''Emit include/nsimd/modules/spmd/functions.hpp.

    For each operator with a scalar implementation this writes a k_<op>
    entry point: on GPU targets (CUDA/ROCm/oneAPI) it maps directly to
    nsimd::gpu_<op>; on CPU it dispatches through an <op>_helper struct to
    either the scalar or the SIMD (pack) implementation.'''
    functions = ''
    for op_name, operator in operators.operators.items():
        if not operator.has_scalar_impl:
            continue
        if operator.params[0] == 'l':
            s_ret_typ = 'bool'
            v_ret_typ = \
                'nsimd::packl::type, N>'
        else:
            s_ret_typ = 'T'
            v_ret_typ = 'nsimd::pack::type, N>'

        # Scalar (one lane) signature pieces: 'p' params are plain ints,
        # 'v' become T, 'l' become bool.
        def s_typ(typ):
            if typ == 'p':
                return 'int'
            if typ == 'v':
                return 'T'
            if typ == 'l':
                return 'bool'
        s_args = ', '.join(['{} a{}'.format(s_typ(operator.params[i]), i - 1) \
                            for i in range(1, len(operator.params))])
        s_call_args = ', '.join(['a{}'.format(i - 1) \
                                 for i in range(1, len(operator.params))])
        s_tmpl = 'typename T' if 'v' in operator.params[1:] else ''

        # SIMD (pack) signature pieces: each vector/logical argument gets its
        # own template parameter A<i>.
        def v_typ(typ, i):
            if typ == 'p':
                return 'int'
            if typ in ['v', 'l']:
                return 'A{}'.format(i)
        v_args = ', '.join(['{} a{}'. \
                            format(v_typ(operator.params[i], i - 1), i - 1) \
                            for i in range(1, len(operator.params))])
        def v_call_arg(typ, i):
            if typ == 'p':
                return '(int)a{}'.format(i)
            if typ == 'v':
                return 'spmd::to_pack(a{})'.format(i)
            if typ == 'l':
                return 'spmd::to_packl(a{})'.format(i)
        v_call_args = ', '.join([v_call_arg(operator.params[i], i - 1) \
                                 for i in range(1, len(operator.params))])
        v_tmpl = ', '.join(['typename A{}'.format(i - 1) \
                            for i in range(1, len(operator.params)) \
                            if operator.params[i] != 'p'])
        m_call_args_cpu = s_call_args
        m_call_args_gpu = s_call_args
        to_type = ''
        ToType = ''
        v_op_name = op_name
        s_op_name = op_name
        template = ''
        # Override for non closed operators
        if operator.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES or \
           op_name == 'to_mask':
            s_ret_typ = 'ToType'
            s_tmpl = append('typename ToType', s_tmpl)
            m_call_args_gpu = append('to_type()', s_call_args)
            s_call_args = append('ToType()', s_call_args)
            v_tmpl = append('typename ToType', v_tmpl)
            to_type = ''
            template = 'template '
            v_ret_typ = 'ToType'
            ToType = ''
            # special case for to_mask
            if op_name == 'to_mask':
                v_op_name = 'reinterpret'
                v_call_args = 'to_mask({})'.format(v_call_args)
        if v_tmpl != '':
            v_tmpl = 'template <{}>'.format(v_tmpl)
        if s_tmpl != '':
            s_tmpl = 'template <{}>'.format(s_tmpl)
        functions += \
            '''#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || \ defined(NSIMD_ONEAPI) {signature} nsimd::gpu_{s_op_name}({m_call_args_gpu}) #else template struct {op_name}_helper {{}}; template struct {op_name}_helper {{ {s_tmpl} static {s_ret_typ} impl({s_args}) {{ return nsimd::scalar_{s_op_name}({s_call_args}); }} }}; template struct {op_name}_helper {{ {v_tmpl} static {v_ret_typ} impl({v_args}) {{ typedef typename spmd::base_type::type T; return nsimd::{v_op_name}{ToType}({v_call_args}); }} }}; {signature} \\ spmd::{op_name}_helper::{template}impl{to_type}( \\ {m_call_args_cpu}) #endif {hbar} '''.format(hbar=common.hbar, s_op_name=s_op_name, s_tmpl=s_tmpl,
           s_ret_typ=s_ret_typ, s_args=s_args, v_args=v_args,
           v_call_args=v_call_args, s_call_args=s_call_args, v_tmpl=v_tmpl,
           v_ret_typ=v_ret_typ, ToType=ToType,
           m_call_args_cpu=m_call_args_cpu, to_type=to_type,
           v_op_name=v_op_name, op_name=op_name, template=template,
           m_call_args_gpu=m_call_args_gpu,
           signature=get_signature(operator))

    # Write the code to file
    dirname = os.path.join(opts.include_dir, 'modules', 'spmd')
    common.mkdir_p(dirname)
    filename = os.path.join(dirname, 'functions.hpp')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as out:
        out.write('#ifndef NSIMD_MODULES_SPMD_FUNCTIONS_HPP\n')
        out.write('#define NSIMD_MODULES_SPMD_FUNCTIONS_HPP\n\n')
        out.write('namespace spmd {\n\n')
        out.write('{}\n\n'.format(common.hbar))
        out.write(functions)
        out.write('} // namespace spmd\n\n')
        out.write('#endif\n')
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------

def name():
    '''Human-readable module name used by the documentation generator.'''
    return 'SPMD programming'

def desc():
    '''Short module description used by the documentation generator.'''
    return '''SPMD programming allows the programmer to focus on kernels and the compiler to vectorize kernel code more effectively. 
Basically this module provides a "à la CUDA" programming C++ DSL to targets CPU SIMD as well as Intel, NVIDIA and AMD GPUs.'''

def doc_menu():
    '''Doc menu entries: title -> markdown file basename.'''
    return {'Overview': 'overview', 'API reference': 'api'}

# -----------------------------------------------------------------------------

def doit(opts):
    '''Module entry point: generate library headers, tests and/or docs as
    requested by opts.'''
    common.myprint(opts, 'Generating module spmd')
    if opts.library:
        gen_functions(opts)
    if opts.tests:
        gen_tests(opts)
    if opts.doc:
        gen_doc_api(opts)
        gen_doc_overview(opts)



================================================
FILE: egg/modules/tet1d/hatch.py
================================================
# Copyright (c) 2021 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import os import operators import common import gen_scalar_utilities import gen_tests as nsimd_tests # ----------------------------------------------------------------------------- # CUDA: default number of threads per block tpb = 128 gpu_params = '(n + {}) / {}, {}'.format(tpb, tpb - 1, tpb) def is_not_closed(operator): return (operator.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES \ or ('v' not in operator.params[1:] and 'l' not in operator.params[1:])) # ----------------------------------------------------------------------------- def gen_doc_overview(opts): filename = common.get_markdown_file(opts, 'overview', 'tet1d') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as fout: fout.write('''# Overview ## What are expression templates? Expression templates are a C++ template metaprogramming technique that essentially allows high level programming for loop fusion. Take the following exemple. ```c++ std::vector operator+(std::vector const &a, std::vector const &b) {{ std::vector ret(a.size()); for (size_t i = 0; i < a.size(); i++) {{ ret[i] = a[i] + b[i]; }} return ret; }} int main() {{ std::vector a, b, c, d, sum; ... sum = a + b + c + d; ... return 0; }} ``` The expression `a + b + c + d` involves three calls to `operator+` and at least nine memory passes are necessary. This can be optimized as follows. ```c++ int main() {{ std::vector a, b, c, d, sum; ... for (size_t i = 0; i < a.size(); i++) {{ ret[i] = a[i] + b[i] + c[i] + d[i]; }} ... return 0; }} ``` The rewriting above requires only four memory passes which is of course better but as humans we prefer the writing `a + b + c + d`. Expression templates solves exactly this problem and allows the programmer to write `a + b + c + d` and the compiler to see the loop written above. ## Expressions templates with NSIMD This module provides expression templates on top of NSIMD core. 
As a consequence the loops seen by the compiler deduced from the high-level expressions are optimized using SIMD instructions. Note also that NVIDIA and AMD GPUs are supported through CUDA and ROCm/HIP. The API for expression templates in NSIMD is C++98 compatible and is able to work with any container as its only requirement for data is that it must be contiguous. All inputs to an expression must be declared using `tet1d::in` while the output must be declared using `tet1d::out`. ```c++ int main() {{ std::vector a, b, c; ... tet1d::out(a) = tet1d::in(&a[0], a.size()) + tet1d::in(&b[0], b.size()); ... return 0; }} ``` - `template inline node in(const T *data, I sz);`{nl} Construct an input for expression templates starting at address `data` and containing `sz` elements. The return type of this functin `node` can be used with the help of the `TET1D_IN(T)` macro where `T` if the underlying type of data (ints, floats, doubles...). - `template node out(T *data);`{nl} Construct an output for expression templates starting at address `data`. Note that memory must be allocated by the user before passing it to the expression template engine. The output type can be used with the `TET1D_OUT(T)` where `T` is the underlying type (ints, floats, doubles...). Note that it is possible to pass parameters to the expression template engine to specify the number of threads per block for GPUs or the SIMD extension to use... - `template node out(T *data, int threads_per_block, void *stream);`{nl} Construct an output for expression templates starting at address `data`. Note that memory must be allocated by the user before passing it to the expression template engine. The `Pack` parameter is useful when compiling for CPUs. The type is `nsimd::pack<...>` allowing the developper to specify all details about the NSIMD packs that will be used by the expression template engine. The `threads_per_block` and `stream` arguments are used only when compiling for GPUs. 
Their meaning is contained in their names. The output type can be used with the `TET1D_OUT_EX(T, N, SimdExt)` where `T` is the underlying type (ints, floats, doubles...), `N` is the unroll factor and `SimdExt` the SIMD extension. Moreover a MATLAB-like syntax is provided. One can select a subrange of given input. Indexes are understood as for Python: -1 represents the last element. The contant `tet1d::end = -1` allows one to write portable code. ```c++ int main() {{ std::vector a, b, c; ... TET1D_IN(float) va = tet1d::in(&a[0], a.size()); TET1D_IN(float) vb = tet1d::in(&b[0], b.size()); tet1d::out(c) = va(10, tet1d::end - 10) + vb; ... return 0; }} ``` One can also specify which elements of the output must be rewritten with the following syntax. ```c++ int main() {{ std::vector a, b, c; ... TET1D_IN(float) va = tet1d::in(&a[0], a.size()); TET1D_IN(float) vb = tet1d::in(&b[0], b.size()); TET1D_OUT(float) vc = tet1d::out(&c[0]); vc(va >= 10 && va < 20) = vb; ... return 0; }} ``` In the exemple above, element `i` in `vc` is written only if `va[i] >= 10` and `va[i] < 20`. The expression appearing in the parenthesis can contain arbitrary expression templates as soon as the underlying type is `bool`. ## Warning using `auto` Using auto can lead to surprising results. We advice you never to use auto when dealing with expression templates. Indeed using `auto` will make the variable an obscure type representing the computation tree of the expression template. This implies that you won't be able to get data from this variable i.e. get the `.data` member for exemple. Again this variable or its type cannot be used in template arguments where you need it. 
'''.format(nl=' ')) # ----------------------------------------------------------------------------- def gen_doc_api(opts): filename = common.get_markdown_file(opts, 'api', 'tet1d') if not common.can_create_filename(opts, filename): return # Build tree for api.md api = dict() for _, operator in operators.operators.items(): if not operator.has_scalar_impl: continue for c in operator.categories: if c not in api: api[c] = [operator] else: api[c].append(operator) def get_signature(op): def get_type(typ): if typ == 'p': return 'int' elif typ == 'v': return 'ExprNumber' elif typ == 'l': return 'ExprBool' ret = get_type(op.params[0]) + ' ' + op.name + '(' if is_not_closed(op): ret += 'ToType' + (', ' if len(op.params[1:]) > 0 else '') ret += ', '.join(['{{t}} {{in{i}}}'.format(i=i). \ format(t=get_type(op.params[i + 1]), in0=common.in0, in1=common.in1, in2=common.in2, in3=common.in3) \ for i in range(len(op.params[1:]))]) ret += ');' return ret with common.open_utf8(opts, filename) as fout: fout.write( '''# NSIMD TET1D API reference This page contains the exhaustive API of the TET1D module. Note that most operators names follow their NSIMD counterparts and have the same semantics. This page is light, you may use CTRL+F to find the operator you are looking for. Note that all operators accept literals and scalars. For example you may write `tet1d::add(a, 1)`. This also applies when using infix operators. Note that literals or scalars of different types can be used with expression involving other types. In all signature below the following pseudo types are used for simplification: - `ExprNumber` to designate an existing expression template on signed, unsigned integers of floatting point types or a scalar of signed, unsigned integers or floatting point types. - `ExprBool` to designate an existing expression template over booleans or a boolean. 
- `ToType` to designate a base type (signed, unsigned integers or floatting point types) and is used when a change in type is requested for example when converting data. ''')
        # One markdown section per category, one bullet per operator.
        for c, ops in api.items():
            if len(ops) == 0:
                continue
            fout.write('\n## {}\n\n'.format(c.title))
            for op in ops:
                fout.write('- `{}` \n'.format(get_signature(op)))
                if op.cxx_operator != None:
                    # cxx_operator is e.g. "operator+"; [8:] keeps only the
                    # symbol after the "operator" prefix.
                    fout.write(' Infix operator: `{}` \n'. \
                               format(op.cxx_operator[8:]))
                fout.write(' {}\n\n'.format(op.desc))

# -----------------------------------------------------------------------------

def gen_tests_for_shifts(opts, t, operator):
    '''Generate the tet1d test file "<op_name>.<t>.cpp" for a shift operator
    (shl/shr/shra).

    The generated main() loops over every shift amount s in [0, typnbits)
    and compares the tet1d expression against a per-backend reference
    implementation.'''
    op_name = operator.name
    dirname = os.path.join(opts.tests_dir, 'modules', 'tet1d')
    common.mkdir_p(dirname)
    filename = os.path.join(dirname, '{}.{}.cpp'.format(op_name, t))
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''#include #include #include "../common.hpp" #if defined(NSIMD_CUDA) __global__ void kernel({t} *dst, {t} *tab0, int n, int s) {{ int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < n) {{ dst[i] = nsimd::gpu_{op_name}(tab0[i], s); }} }} void compute_result({t} *dst, {t} *tab0, unsigned int n, int s) {{ kernel<<<{gpu_params}>>>(dst, tab0, int(n), s); }} {cbprng_cuda} #elif defined(NSIMD_ROCM) __global__ void kernel({t} *dst, {t} *tab0, size_t n, int s) {{ size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if (i < n) {{ dst[i] = nsimd::gpu_{op_name}(tab0[i], s); }} }} void compute_result({t} *dst, {t} *tab0, size_t n, int s) {{ hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, tab0, n, s); }} {cbprng_hip} #elif defined(NSIMD_ONEAPI) inline void kernel({t} *dst, {t} *tab0, const size_t n, const int s, sycl::nd_item<1> item) {{ size_t ii = item.get_global_id().get(0); if (ii < n){{ dst[ii] = nsimd::gpu_{op_name}(tab0[ii], s); }} }} void compute_result({t} *dst, {t} *tab0, size_t n, int s) {{ size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb}); sycl::queue q_ = 
nsimd::oneapi::default_queue(); q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), sycl::range<1>({tpb})), [=](sycl::nd_item<1> item){{ kernel(dst, tab0, n, s, item); }}).wait_and_throw(); }} {cbprng_oneapi} #else void compute_result({t} *dst, {t} *tab0, unsigned int n, int s) {{ for (unsigned int i = 0; i < n; i++) {{ dst[i] = nsimd_scalar_{op_name}_{t}(tab0[i], s); }} }} {cbprng_cpu} #endif int main() {{ unsigned int n_[3] = {{ 10, 1001, 10001 }}; for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{ unsigned int n = n_[i]; for (int s = 0; s < {typnbits}; s++) {{ int ret = 0; {t} *tab0 = nsimd::device_calloc<{t}>(n); random(tab0, n, 0); {t} *ref = nsimd::device_calloc<{t}>(n); {t} *out = nsimd::device_calloc<{t}>(n); compute_result(ref, tab0, n, s); tet1d::out(out) = tet1d::{op_name}(tet1d::in(tab0, n), s); if (!cmp(ref, out, n)) {{ ret = -1; }} nsimd::device_free(ref); nsimd::device_free(out); nsimd::device_free(tab0); if (ret != 0) {{ return ret; }} }} }} return 0; }} '''.format(gpu_params=gpu_params, op_name=op_name, t=t, typnbits=t[1:],
           tpb=tpb,
           cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),
           cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda', gpu_params),
           cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip', gpu_params),
           cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',
                                            ['(int)n', str(tpb)])))
    common.clang_format(opts, filename, cuda=True)

def gen_tests_for(opts, tt, t, operator):
    '''Generate the tet1d test file for one operator on type t (and output
    type tt for non-closed operators such as cvt/reinterpret).'''
    op_name = operator.name
    dirname = os.path.join(opts.tests_dir, 'modules', 'tet1d')
    common.mkdir_p(dirname)
    # Non-closed operators get a "<t>_<tt>" suffix in the file name.
    filename = os.path.join(dirname,
                            '{}.{}.cpp'.format(op_name,
                                               t if t == tt \
                                               else '{}_{}'.format(t, tt)))
    if not common.can_create_filename(opts, filename):
        return
    # One input buffer tab<i> per operator input parameter.
    arity = len(operator.params[1:])
    args_tabs = ', '.join(['{typ} *tab{i}'.format(typ=t, i=i) \
                           for i in range(arity)])
    args_tabs_call = ', '.join(['tab{i}'.format(i=i) \
                                for i in range(arity)])
    args_tabs_i_call = ', '.join(['tab{i}[i]'.format(i=i) \
                                  for i in range(arity)])
    args_in_tabs_call \
= ', '.join(['tet1d::in(tab{i}, n)'. \
                 format(i=i) \
                 for i in range(arity)])
    fill_tabs = '\n'.join(['{typ} *tab{i} = nsimd::device_calloc<{typ}>(n);\n' \
                           'random(tab{i}, n, {i});'.format(typ=t, i=i) \
                           for i in range(arity)])
    free_tabs = '\n'.join(['nsimd::device_free(tab{i});'. \
                           format(typ=t, i=i) for i in range(arity)])
    # f16 constants/comparisons go through the {f32_to_f16}/{f16_to_f32}
    # placeholders so each backend can substitute its own conversion helper.
    zero = '{}(0)'.format(t) if t != 'f16' else '{f32_to_f16}(0.0f)'
    one = '{}(1)'.format(t) if t != 'f16' else '{f32_to_f16}(1.0f)'
    comp_tab0_to_1 = 'tab0[i] == {}(1)'.format(t) if t != 'f16' else \
                     '{f16_to_f32}(tab0[i]) == 1.0f'
    comp_tab1_to_1 = 'tab1[i] == {}(1)'.format(t) if t != 'f16' else \
                     '{f16_to_f32}(tab1[i]) == 1.0f'
    # Build the tet1d expression under test and the matching per-element
    # reference kernel ({{p}} is later replaced by "scalar" or "gpu").
    if op_name == 'cvt':
        # Round-trip: cvt to tt then back to t must be the identity.
        tet1d_code = \
            '''tet1d::out(out) = tet1d::cvt<{t}>(tet1d::cvt<{tt}>( tet1d::in(tab0, n)));'''. \
            format(t=t, tt=tt)
        compute_result_kernel = \
            '''dst[i] = nsimd::{{p}}_cvt({t}(), nsimd::{{p}}_cvt( {tt}(), tab0[i]));'''.format(t=t, tt=tt)
    elif op_name == 'reinterpret':
        # Round-trip reinterpret is also the identity.
        tet1d_code = \
            '''tet1d::out(out) = tet1d::reinterpret<{t}>( tet1d::reinterpret<{tt}>(tet1d::in( tab0, n)));'''.format(t=t, tt=tt)
        compute_result_kernel = \
            '''dst[i] = nsimd::{{p}}_reinterpret({t}(), nsimd::{{p}}_reinterpret({tt}(), tab0[i]));'''.format(t=t, tt=tt)
    elif op_name in ['to_mask', 'to_logical']:
        tet1d_code = \
            '''tet1d::out(out) = tet1d::to_mask(tet1d::to_logical(tet1d::in( tab0, n)));'''
        compute_result_kernel = \
            '''dst[i] = nsimd::{{p}}_to_mask({t}(), nsimd::{{p}}_to_logical(tab0[i]));'''. \
            format(t=t)
    elif operator.params == ['v'] * len(operator.params):
        # Purely vector operator: use the infix C++ operator when available.
        compute_result_kernel = \
            'dst[i] = nsimd::{{p}}_{op_name}({args_tabs_i_call});'. \
            format(op_name=op_name, args_tabs_i_call=args_tabs_i_call)
        if operator.cxx_operator != None:
            if len(operator.params[1:]) == 1:
                tet1d_code = 'tet1d::out(out) = {cxx_op}tet1d::in(tab0, n);'. \
                             format(cxx_op=operator.cxx_operator)
            else:
                tet1d_code = 'tet1d::out(out) = tet1d::in(tab0, n) {cxx_op} ' \
                             'tet1d::in(tab1, n);'. \
                             format(cxx_op=operator.cxx_operator)
        else:
            tet1d_code = \
                'tet1d::out(out) = tet1d::{op_name}({args_in_tabs_call});'. \
                format(op_name=op_name, args_in_tabs_call=args_in_tabs_call)
    elif operator.params == ['l', 'v', 'v']:
        # Comparison operator: select output elements through a mask.
        if operator.cxx_operator != None:
            cond = 'A {} B'.format(operator.cxx_operator)
        else:
            cond = 'tet1d::{}(A, B)'.format(op_name)
        tet1d_code = \
            '''TET1D_OUT({typ}) Z = tet1d::out(out); TET1D_IN({typ}) A = tet1d::in(tab0, n); TET1D_IN({typ}) B = tet1d::in(tab1, n); Z({cond}) = 1;'''.format(cond=cond, typ=t)
        compute_result_kernel = \
            '''if (nsimd::{{p}}_{op_name}(tab0[i], tab1[i])) {{{{ dst[i] = {one}; }}}} else {{{{ dst[i] = {zero}; }}}}'''.format(op_name=op_name, typ=t, one=one, zero=zero)
    elif operator.params == ['l'] * len(operator.params):
        # Logical-only operator: inputs are synthesized as "tab == 1".
        if len(operator.params[1:]) == 1:
            if operator.cxx_operator != None:
                cond = '{}(A == 1)'.format(operator.cxx_operator)
            else:
                cond = 'tet1d::{}(A == 1)'.format(op_name)
            tet1d_code = \
                '''TET1D_OUT({typ}) Z = tet1d::out(out); TET1D_IN({typ}) A = tet1d::in(tab0, n); Z({cond}) = 1;'''.format(cond=cond, typ=t)
            compute_result_kernel = \
                '''if (nsimd::{{p}}_{op_name}({comp_tab0_to_1})) {{{{ dst[i] = {one}; }}}} else {{{{ dst[i] = {zero}; }}}}'''.format(op_name=op_name, typ=t, one=one, zero=zero, comp_tab0_to_1=comp_tab0_to_1)
        if len(operator.params[1:]) == 2:
            if operator.cxx_operator != None:
                cond = '(A == 1) {} (B == 1)'.format(operator.cxx_operator)
            else:
                cond = 'tet1d::{}(A == 1, B == 1)'.format(op_name)
            tet1d_code = \
                '''TET1D_OUT({typ}) Z = tet1d::out(out); TET1D_IN({typ}) A = tet1d::in(tab0, n); TET1D_IN({typ}) B = tet1d::in(tab1, n); Z({cond}) = 1;'''.format(cond=cond, typ=t)
            compute_result_kernel = \
                '''if (nsimd::{{p}}_{op_name}({comp_tab0_to_1}, {comp_tab1_to_1})) {{{{ dst[i] = {one}; }}}} else {{{{ dst[i] = {zero}; }}}}'''.format(op_name=op_name, typ=t, one=one, zero=zero, comp_tab0_to_1=comp_tab0_to_1, comp_tab1_to_1=comp_tab1_to_1)
    else:
        raise Exception('Unsupported operator: "{}"'.format(op_name))
    # Specialize the reference kernel for each backend.
    cpu_kernel = compute_result_kernel.format(p='scalar',
                                              f32_to_f16='nsimd_f32_to_f16',
                                              f16_to_f32='nsimd_f16_to_f32')
    cuda_rocm_kernel = compute_result_kernel.format(p='gpu',
                                                    f32_to_f16='__float2half',
                                                    f16_to_f32='__half2float')
    oneapi_kernel = compute_result_kernel.format(p='gpu', f32_to_f16='(f16)',
                                                 f16_to_f32='(f32)')
    # Integer types compare exactly; floats use the operator's ulp budget.
    comp = '!cmp(ref, out, n{})'.format('' if t in common.iutypes \
                                        else ', {}'.format(operator.ufp[t]))
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''#include #include #include "../common.hpp" #if defined(NSIMD_CUDA) __global__ void kernel({typ} *dst, {args_tabs}, int n) {{ int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < n) {{ {cuda_rocm_kernel} }} }} void compute_result({typ} *dst, {args_tabs}, unsigned int n) {{ kernel<<<{gpu_params}>>>(dst, {args_tabs_call}, int(n)); }} {cbprng_cuda} #elif defined(NSIMD_ROCM) __global__ void kernel({typ} *dst, {args_tabs}, size_t n) {{ size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if (i < n) {{ {cuda_rocm_kernel} }} }} void compute_result({typ} *dst, {args_tabs}, size_t n) {{ hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, {args_tabs_call}, n); }} {cbprng_hip} #elif defined(NSIMD_ONEAPI) inline void kernel({typ} *dst, {args_tabs}, const size_t n, sycl::nd_item<1> item) {{ size_t i = item.get_global_id().get(0); if (i < n) {{ {oneapi_kernel} }} }} void compute_result({typ} *dst, {args_tabs}, const size_t n) {{ size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb}); sycl::queue q_ = nsimd::oneapi::default_queue(); q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), sycl::range<1>({tpb})), [=](sycl::nd_item<1> item){{ kernel(dst, {args_tabs_call}, n, item); }}).wait_and_throw(); }} {cbprng_oneapi} #else void compute_result({typ} *dst, {args_tabs}, unsigned int n) {{ for (unsigned int i = 0; i < n; i++) {{ {cpu_kernel} }} }} {cbprng_cpu} #endif int main() {{ unsigned int n_[3] = {{ 10, 1001, 10001 }}; for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); 
i++) {{ unsigned int n = n_[i]; int ret = 0; {fill_tabs} {typ} *ref = nsimd::device_calloc<{typ}>(n); {typ} *out = nsimd::device_calloc<{typ}>(n); compute_result(ref, {args_tabs_call}, n); {tet1d_code} if ({comp}) {{ ret = -1; }} nsimd::device_free(ref); nsimd::device_free(out); {free_tabs} if (ret != 0) {{ return ret; }} }} return 0; }} '''.format(typ=t, args_tabs=args_tabs, fill_tabs=fill_tabs,
           args_tabs_call=args_tabs_call, gpu_params=gpu_params,
           free_tabs=free_tabs, tet1d_code=tet1d_code, comp=comp,
           cpu_kernel=cpu_kernel, tpb=tpb,
           cuda_rocm_kernel=cuda_rocm_kernel, oneapi_kernel=oneapi_kernel,
           cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),
           cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda', gpu_params),
           cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip', gpu_params),
           cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',
                                            ['(int)n', str(tpb)])))
    common.clang_format(opts, filename, cuda=True)

def gen_tests(opts):
    '''Generate every tet1d test: one C++ file per (operator, type) pair,
    with a dedicated driver for the shift operators.'''
    for op_name, operator in operators.operators.items():
        if not operator.has_scalar_impl:
            continue
        for t in operator.types:
            tts = common.get_output_types(t, operator.output_to)
            for tt in tts:
                if not nsimd_tests.should_i_do_the_test(operator, tt, t):
                    continue
                if operator.name in ['shl', 'shr', 'shra']:
                    gen_tests_for_shifts(opts, t, operator)
                else:
                    gen_tests_for(opts, tt, t, operator)

# -----------------------------------------------------------------------------

def gen_functions(opts):
    '''Emit the tet1d expression-template node types (continues past this
    view).'''
    functions = ''
    for op_name, operator in operators.operators.items():
        if not operator.has_scalar_impl:
            continue
        not_closed = is_not_closed(operator)
        not_closed_tmpl_args = 'typename ToType, ' if not_closed else ''
        not_closed_tmpl_params = 'ToType' if not_closed else 'none_t'
        if op_name in ['shl', 'shr', 'shra']:
            tmpl_args = 'typename Left'
            tmpl_params = 'Left, none_t, none_t'
            size = 'return left.size();'
            args = 'Left const &left, int s'
            members = 'Left left; int s;'
            members_assignment = 'ret.left = to_node(left); ret.s = s;'
            to_node_type = 'typename to_node_t::type, none_t, 
none_t' elif len(operator.params) == 2: tmpl_args = not_closed_tmpl_args + 'typename Left' tmpl_params = 'Left, none_t, ' + not_closed_tmpl_params size = 'return left.size();' args = 'Left const &left' members = 'Left left;' members_assignment = 'ret.left = to_node(left);' to_node_type = 'typename to_node_t::type, none_t, none_t' elif len(operator.params) == 3: tmpl_args = 'typename Left, typename Right' tmpl_params = 'Left, Right, none_t' size = 'return compute_size(left.size(), right.size());' args = 'Left const &left, Right const &right' members = 'Left left;\nRight right;' members_assignment = '''ret.left = to_node(left); ret.right = to_node(right);''' to_node_type = 'typename to_node_t::type, ' \ 'typename to_node_t::type, none_t' elif len(operator.params) == 4: tmpl_args = 'typename Left, typename Right, typename Extra' tmpl_params = 'Left, Right, Extra' size = \ 'return compute_size(left.size(), right.size(), extra.size());' args = 'Left const &left, Right const &right, Extra const &extra' members = 'Left left;\nRight right;\nExtra extra;' members_assignment = '''ret.left = to_node(left); ret.right = to_node(right); ret.extra = to_node(extra);''' to_node_type = 'typename to_node_t::type, ' \ 'typename to_node_t::type, ' \ 'typename to_node_t::type' if operator.returns == 'v': to_pack = 'to_pack_t' return_type = 'out_type' else: to_pack = 'to_packl_t' return_type = 'bool' if not_closed: to_typ_arg = 'out_type(), ' to_typ_tmpl_arg = '::type>'. 
\ format(to_pack=to_pack) in_out_typedefs = '''typedef typename Left::out_type in_type; typedef ToType out_type;''' to_node_type = 'typename to_node_t::type, none_t, ToType' else: to_typ_arg = '' if op_name != 'to_mask' else 'out_type(), ' to_typ_tmpl_arg = '' in_out_typedefs = '''typedef typename Left::out_type in_type; typedef typename Left::out_type out_type;''' impl_args = 'left.{cpu_gpu}_get{tmpl}(i)' if (len(operator.params[1:]) >= 2): if operator.params[2] == 'p': impl_args += ', s' else: impl_args += ', right.{cpu_gpu}_get{tmpl}(i)' if (len(operator.params[1:]) >= 3): impl_args += ', extra.{cpu_gpu}_get{tmpl}(i)' impl_scalar = 'return nsimd::scalar_{}({}{});'. \ format(op_name, to_typ_arg, impl_args.format(cpu_gpu='scalar', tmpl='')) impl_gpu = 'return nsimd::gpu_{}({}{});'. \ format(op_name, to_typ_arg, impl_args.format(cpu_gpu='gpu', tmpl='')) impl_simd = 'return nsimd::{}{}({});'. \ format(op_name, to_typ_tmpl_arg, impl_args.format(cpu_gpu='template simd', tmpl='')) functions += \ '''struct {op_name}_t {{}}; template <{tmpl_args}> struct node<{op_name}_t, {tmpl_params}> {{ {in_out_typedefs} {members} nsimd::nat size() const {{ {size} }} #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) __device__ {return_type} gpu_get(nsimd::nat i) const {{ {impl_gpu} }} #elif defined(NSIMD_ONEAPI) {return_type} gpu_get(nsimd::nat i) const {{ {impl_gpu} }} #else {return_type} scalar_get(nsimd::nat i) const {{ {impl_scalar} }} template typename {to_pack}::type simd_get(nsimd::nat i) const {{ {impl_simd} }} #endif }}; template<{tmpl_args}> node<{op_name}_t, {to_node_type}> {op_name}({args}) {{ node<{op_name}_t, {to_node_type}> ret; {members_assignment} return ret; }}'''.format(op_name=op_name, tmpl_args=tmpl_args, size=size, tmpl_params=tmpl_params, return_type=return_type, args=args, to_pack=to_pack, to_node_type=to_node_type, members=members, members_assignment=members_assignment, in_out_typedefs=in_out_typedefs, impl_gpu=impl_gpu, impl_scalar=impl_scalar, 
impl_simd=impl_simd) if operator.cxx_operator != None and len(operator.params) == 2: functions += \ ''' template node<{op_name}_t, node, none_t, none_t> operator{cxx_operator}(node const &node) {{ return tet1d::{op_name}(node); }}'''.format(op_name=op_name, cxx_operator=operator.cxx_operator); if operator.cxx_operator != None and len(operator.params) == 3: functions += ''' template node<{op_name}_t, node, node::in_type>, none_t> operator{cxx_operator}(node const &node, T a) {{ typedef typename tet1d::node::in_type S; return tet1d::{op_name}(node, literal_to::impl(a)); }} template node<{op_name}_t, node::in_type>, node, none_t> operator{cxx_operator}(T a, node const &node) {{ typedef typename tet1d::node::in_type S; return tet1d::{op_name}(literal_to::impl(a), node); }} template node<{op_name}_t, node, node, none_t> operator{cxx_operator}(node const &left, node const &right) {{ return tet1d::{op_name}(left, right); }}'''.format(op_name=op_name, cxx_operator=operator.cxx_operator); functions += '\n\n{}\n\n'.format(common.hbar) # Write the code to file dirname = os.path.join(opts.include_dir, 'modules', 'tet1d') common.mkdir_p(dirname) filename = os.path.join(dirname, 'functions.hpp') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as out: out.write('#ifndef NSIMD_MODULES_TET1D_FUNCTIONS_HPP\n') out.write('#define NSIMD_MODULES_TET1D_FUNCTIONS_HPP\n\n') out.write('namespace tet1d {\n\n') out.write('{}\n\n'.format(common.hbar)) out.write(functions) out.write('} // namespace tet1d\n\n') out.write('#endif\n') common.clang_format(opts, filename) # ----------------------------------------------------------------------------- def name(): return 'Tiny expression templates 1D' def desc(): return '''This module provide a thin layer of expression templates above NSIMD core. It also allows the programmer to target Intel, NVIDIA and AMD GPUs. 
def doit(opts):
    """Entry point of the tet1d module generator.

    Runs the generation passes selected on the command line: the header
    library, the tests and/or the documentation.
    """
    common.myprint(opts, 'Generating module tet1d')
    # (flag, passes-to-run) pairs, executed in the historical order.
    passes = [(opts.library, (gen_functions,)),
              (opts.tests, (gen_tests,)),
              (opts.doc, (gen_doc_api, gen_doc_overview))]
    for enabled, funcs in passes:
        if not enabled:
            continue
        for f in funcs:
            f(opts)
def get_impl_f16(operator, totyp, typ):
    """Return the C++ body of the oneAPI scalar implementation of
    `operator` for the f16 (sycl::half) type.

    The returned string is a C++ snippet whose `{in0}`/`{in1}`/`{in2}`
    placeholders are resolved through the module-level `fmtspec` dict
    (filled in by get_impl() before dispatching here). The code emitted
    falls into one of five cases depending on what SYCL provides for
    half precision.

    NOTE(review): the `<f32>` template arguments of the static_cast's
    below had been lost; they are restored here so that the generated
    C++ compiles (the surrounding comments document that conversions go
    through f32).
    """
    # Case 1: rounding functions
    #   no sycl function available for half type
    #   sycl function available for f32
    #   use sycl defined conversions half --> f32 , f32 --> half
    # Case 2: no sycl function available for half type
    #   sycl function available for f32
    #   use nsimd casts f32-->f16 + sycl function + f16-->f32
    no_sycl_avail_f16_cast_use_sycl_f32 = \
        ['fma', 'fms', 'fnma', 'fnms', 'min', 'max', 'abs']
    # Case 3: sycl provides functions supporting half type
    sycl_avail_functions_f16 = \
        ['rec', 'rec8', 'rec11', 'rsqrt8', 'rsqrt11', 'rsqrt', 'sqrt']
    # Case 4: sycl half's type provided comparison operators
    # Note:
    # not documented in the book
    # source: sycl half type (f16) API:
    # https://mmha.github.io/syclreference/libraries/types/half/
    sycl_avail_cmp_op_f16 = {
        'lt': 'return {in0} < {in1};',
        'gt': 'return {in0} > {in1};',
        'le': 'return {in0} <= {in1};',
        'ge': 'return {in0} >= {in1};',
        'ne': 'return {in0} != {in1};',
        'eq': 'return {in0} == {in1};'
    }
    # Case 5: no sycl function available for any type
    # use nsimd_scalar_[operator]_f16

    # Dispatch
    # Case 1
    if operator.name in ['floor', 'ceil', 'trunc']:
        return 'return f16(sycl::{op}(static_cast<f32>({in0})));'.\
               format(op=operator.name, **fmtspec)
    elif operator.name == 'round_to_even':
        return 'return f16(sycl::rint(static_cast<f32>({in0})));'.\
               format(**fmtspec)
    # Case 2
    elif operator.name in no_sycl_avail_f16_cast_use_sycl_f32:
        if operator.name in ['fma', 'fms', 'fnma', 'fnms']:
            # fnma/fnms negate the product, fms/fnms negate the addend.
            neg = '-' if operator.name in ['fnma', 'fnms'] else ''
            op = '-' if operator.name in ['fnms', 'fms'] else ''
            return '''// cl::sycl::half::operator float
                      f32 x0 = static_cast<f32>({in0});
                      f32 x1 = static_cast<f32>({in1});
                      f32 x2 = static_cast<f32>({in2});
                      f32 res = sycl::fma({neg}x0, x1, {op}x2);
                      // cl::sycl::half::half(const float& f)
                      return f16(res);'''.format(neg=neg, op=op, **fmtspec)
        elif operator.name in ['min', 'max']:
            op = 'fmin' if operator.name == 'min' else 'fmax'
            return '''// cl::sycl::half::operator float
                      f32 x0 = static_cast<f32>({in0});
                      f32 x1 = static_cast<f32>({in1});
                      f32 res = sycl::{op}(x0, x1);
                      // cl::sycl::half::half(const float& f)
                      return f16(res);'''.format(op=op, **fmtspec)
        elif operator.name == 'abs':
            return '''// cl::sycl::half::operator float
                      f32 x0 = static_cast<f32>({in0});
                      f32 res = sycl::fabs(x0);
                      // cl::sycl::half::half(const float& f)
                      return f16(res);'''.format(**fmtspec)
    # Case 3
    elif operator.name in sycl_avail_functions_f16:
        if operator.name in ['rec8', 'rec11', 'rec']:
            return '''// sycl::recip available in native form only
                      // availability in half-precision
                      return f16(1.0f / {in0});'''.format(**fmtspec)
        elif operator.name in ['rsqrt8', 'rsqrt11', 'rsqrt']:
            return 'return sycl::rsqrt({in0});'.format(**fmtspec)
        elif operator.name == 'sqrt':
            return 'return sycl::sqrt({in0});'.format(**fmtspec)
    # Case 4
    elif operator.name in sycl_avail_cmp_op_f16:
        return sycl_avail_cmp_op_f16[operator.name].format(**fmtspec)
    # Case 5
    else:
        args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \
                          for i in range(len(operator.params[1:]))])
        return 'return nsimd_scalar_{op}_f16({args});'.\
               format(op=operator.name, args=args)
def get_impl(operator, totyp, typ):
    """Return the C++ body of the oneAPI scalar implementation of
    `operator` from type `typ` to type `totyp`.

    Fills the module-level `fmtspec` dict (also read by the helpers
    get_impl_f16() and reinterpret()) and dispatches on the operator
    name / type to pick between native SYCL built-ins and the generic
    nsimd_scalar_* fallbacks.
    """
    global fmtspec
    fmtspec = {
        'in0': common.in0,
        'in1': common.in1,
        'in2': common.in2,
        'typ': typ,
        'totyp': totyp,
        'typnbits': typ[1:]
    }
    # src operators: math functions normally generated from Sleef are
    # mapped to their SYCL equivalents (accuracy suffixes are dropped).
    if operator.src:
        oneapi_ops = {
            'sin_u35': 'sin', 'cos_u35': 'cos', 'tan_u35': 'tan',
            'asin_u35': 'asin', 'acos_u35': 'acos', 'atan_u35': 'atan',
            'atan2_u35': 'atan2', 'log_u35': 'log', 'cbrt_u35': 'cbrt',
            'sin_u10': 'sin', 'cos_u10': 'cos', 'tan_u10': 'tan',
            'asin_u10': 'asin', 'acos_u10': 'acos', 'atan_u10': 'atan',
            'atan2_u10': 'atan2', 'log_u10': 'log', 'cbrt_u10': 'cbrt',
            'exp_u10': 'exp', 'pow_u10': 'pow', 'sinh_u10': 'sinh',
            'cosh_u10': 'cosh', 'tanh_u10': 'tanh', 'sinh_u35': 'sinh',
            'cosh_u35': 'cosh', 'tanh_u35': 'tanh',
            'fastsin_u3500': 'sin', 'fastcos_u3500': 'cos',
            'fastpow_u3500': 'pow', 'asinh_u10': 'asinh',
            'acosh_u10': 'acosh', 'atanh_u10': 'atanh',
            'exp2_u10': 'exp2', 'exp2_u35': 'exp2',
            'exp10_u10': 'exp10', 'exp10_u35': 'exp10',
            'expm1_u10': 'expm1', 'log10_u10': 'log10',
            'log2_u10': 'log2', 'log2_u35': 'log2',
            'log1p_u10': 'log1p', 'sinpi_u05': 'sinpi',
            'cospi_u05': 'cospi', 'hypot_u05': 'hypot',
            'hypot_u35': 'hypot', 'remainder': 'remainder',
            'fmod': 'fmod', 'lgamma_u10': 'lgamma',
            'tgamma_u10': 'tgamma', 'erf_u10': 'erf', 'erfc_u15': 'erfc'
        }
        return 'return cl::sycl::{}({});'.format(
                   oneapi_ops[operator.name],
                   common.get_args(len(operator.params[1:])))
    # bool first, no special treatment for f16's
    bool_operators = ['andl', 'orl', 'xorl', 'andnotl', 'notl']
    if operator.name in bool_operators:
        if operator.name == 'notl':
            return 'return nsimd_scalar_{op}({in0});'.\
                   format(op=operator.name, **fmtspec)
        else:
            return 'return nsimd_scalar_{op}({in0}, {in1});'.\
                   format(op=operator.name, **fmtspec)
    # infix operators no special treatment for f16's
    infix_operators = ['orb', 'andb', 'andnotb', 'notb', 'xorb']
    if operator.name in infix_operators:
        if operator.name == 'notb':
            return 'return nsimd_scalar_{op}_{typ}({in0});'.\
                   format(op=operator.name, **fmtspec)
        else:
            return 'return nsimd_scalar_{op}_{typ}({in0}, {in1});'.\
                   format(op=operator.name, **fmtspec)
    # reinterpret
    if operator.name == 'reinterpret':
        return reinterpret(totyp, typ)
    # cvt
    if operator.name == 'cvt':
        if 'f16' == totyp:
            # conversion op: takes in a 32 bit float and converts it to
            # 16 bits. NOTE(review): the <f32> template argument of the
            # static_cast had been lost; restored so the generated C++
            # compiles.
            return 'return sycl::half(static_cast<f32>({in0}));'. \
                   format(**fmtspec)
        else:
            return 'return nsimd_scalar_cvt_{totyp}_{typ}({in0});'. \
                   format(**fmtspec)
    # to_mask
    if operator.name == 'to_mask':
        return 'return nsimd_scalar_to_mask_{totyp}({in0});'.format(**fmtspec)
    # to_logical
    if operator.name == 'to_logical':
        return 'return nsimd_scalar_to_logical_{typ}({in0});'.format(**fmtspec)
    # for all other operators, f16 has a special treatment
    if typ == 'f16':
        return get_impl_f16(operator, totyp, typ)
    # infix operators - rec - f32, f64
    infix_op_rec_ftypes = ['rec', 'rec8', 'rec11']
    if typ in common.ftypes_no_f16 and operator.name in infix_op_rec_ftypes:
        return '''// sycl::recip available in native form only
                  return 1.0{f} / {in0};'''. \
               format(f='f' if typ == 'f32' else '', **fmtspec)
    # infix operators - cmp - f32, f64
    # sycl::is* return int for f64 (vector relational semantics), hence
    # the cast back to int for the f64 case only.
    infix_op_cmp_f32_f64 = {
        'lt': 'return {cast_to_int}sycl::isless({in0}, {in1});',
        'gt': 'return {cast_to_int}sycl::isgreater({in0}, {in1});',
        'le': 'return {cast_to_int}sycl::islessequal({in0}, {in1});',
        'ge': 'return {cast_to_int}sycl::isgreaterequal({in0}, {in1});',
        'ne': 'return {cast_to_int}sycl::isnotequal({in0}, {in1});',
        'eq': 'return {cast_to_int}sycl::isequal({in0}, {in1});'
    }
    if typ in common.ftypes_no_f16 and operator.name in infix_op_cmp_f32_f64:
        return infix_op_cmp_f32_f64[operator.name]. \
               format(cast_to_int='(int)' if typ == 'f64' else '', **fmtspec)
    # infix operators - cmp - integer types
    infix_op_cmp_iutypes = ['lt', 'gt', 'le', 'ge', 'ne', 'eq']
    if operator.name in infix_op_cmp_iutypes:
        return 'return nsimd_scalar_{op}_{typ}({in0},{in1});'.\
               format(op=operator.name, **fmtspec)
    # infix operators f32, f64 + integers
    # ref: see Data Parallel C++ book, pages 480, 481, 482
    # TODO: do the functions below call instrinsics/built-in
    #       functions on the device?
    #       'add': 'return std::plus<{typ}>()({in0}, {in1});',
    #       'sub': 'return std::minus<{typ}>()({in0}, {in1});',
    #       'mul': 'return std::multiplies<{typ}>()({in0}, {in1});',
    #       'div': 'return std::divides<{typ}>()({in0}, {in1});',
    infix_op_t = ['add', 'sub', 'mul', 'div']
    if operator.name in infix_op_t:
        return 'return nsimd_scalar_{op}_{typ}({in0}, {in1});'. \
               format(op=operator.name, **fmtspec)
    # neg
    # ref: see Data Parallel C++ book, pages 480, 481, 482
    # TODO: does the function below call an instrinsic/built-in
    #       function on the device?
    #       'neg': 'return std::negate<{typ}>()({in0});'
    if operator.name == 'neg':
        return 'return nsimd_scalar_{op}_{typ}({in0});'. \
               format(op=operator.name, **fmtspec)
    # shifts
    shifts_op_ui_t = ['shl', 'shr', 'shra']
    if operator.name in shifts_op_ui_t and typ in common.iutypes:
        return 'return nsimd_scalar_{op}_{typ}({in0}, {in1});'. \
               format(op=operator.name, **fmtspec)
    # adds: saturated add exists only for integer types in SYCL; floats
    # fall back to the plain nsimd scalar add.
    if operator.name == 'adds':
        if typ in common.ftypes:
            return 'return nsimd_scalar_add_{typ}({in0}, {in1});'. \
                   format(**fmtspec)
        else:
            return 'return sycl::add_sat({in0}, {in1});'.format(**fmtspec)
    # subs: same policy as adds.
    if operator.name == 'subs':
        if typ in common.ftypes:
            return 'return nsimd_scalar_sub_{typ}({in0}, {in1});'. \
                   format(**fmtspec)
        else:
            return 'return sycl::sub_sat({in0}, {in1});'.format(**fmtspec)
    # fma's: fnma/fnms negate the product, fms/fnms negate the addend.
    if operator.name in ['fma', 'fms', 'fnma', 'fnms']:
        if typ in common.ftypes:
            neg = '-' if operator.name in ['fnma', 'fnms'] else ''
            op = '-' if operator.name in ['fnms', 'fms'] else ''
            return 'return sycl::fma({neg}{in0}, {in1}, {op}{in2});'. \
                   format(op=op, neg=neg, **fmtspec)
        else:
            return 'return nsimd_scalar_{op}_{typ}({in0}, {in1}, {in2});'. \
                   format(op=operator.name, **fmtspec)
    # other operators
    # round_to_even, ceil, floor, trunc, min, max, abs, sqrt

    # round_to_even: integers are already rounded.
    if operator.name == 'round_to_even':
        if typ in common.ftypes_no_f16:
            return 'return sycl::rint({in0});'.format(**fmtspec)
        else:
            return 'return {in0};'.format(**fmtspec)
    # other rounding operators
    other_rounding_ops = ['ceil', 'floor', 'trunc']
    if operator.name in other_rounding_ops:
        if typ in common.iutypes:
            return 'return nsimd_scalar_{op}_{typ}({in0});'. \
                   format(op=operator.name, **fmtspec)
        else:
            return 'return sycl::{op}({in0});'. \
                   format(op=operator.name, **fmtspec)
    # min/max
    if operator.name in ['min', 'max']:
        if typ in common.iutypes:
            return 'return sycl::{op}({in0}, {in1});'.\
                   format(op=operator.name, **fmtspec)
        else:
            op = 'sycl::fmin' if operator.name == 'min' else 'sycl::fmax'
            return 'return {op}({in0}, {in1});'.format(op=op, **fmtspec)
    # abs: sycl::abs on signed ints returns the unsigned type, hence the
    # cast back; unsigned abs is the identity via the nsimd scalar.
    if operator.name == 'abs':
        if typ in common.itypes:
            return 'return ({typ})sycl::abs({in0});'.format(**fmtspec)
        elif typ in common.utypes:
            return 'return nsimd_scalar_abs_{typ}({in0});'.format(**fmtspec)
        else:
            return 'return sycl::fabs({in0});'.format(**fmtspec)
    # sqrt
    if operator.name == 'sqrt' and typ in common.ftypes:
        return 'return sycl::sqrt({in0});'.format(**fmtspec)
    # rsqrt
    if operator.name in ['rsqrt8', 'rsqrt11', 'rsqrt'] and typ in common.ftypes:
        return 'return sycl::rsqrt({in0});'.format(**fmtspec)
# Metaclass that registers every concrete documentation-category class
# into the module-level `categories` ordered dict, keyed by class name.
class MAddToCategories(type):
    def __new__(cls, name, bases, dct):
        # The abstract base `DocCategory` carries no title and is not a
        # category itself: only concrete subclasses are checked.
        if name != 'DocCategory':
            if 'title' not in dct:
                raise Exception('No member title provided for class {}'. \
                                format(name))
        # Every class (base included) gets its name and a doc-site id
        # injected as class attributes before creation.
        dct['name'] = name
        dct['id'] = '/categories/{}'.format(name)
        ret = type.__new__(cls, name, bases, dct)
        if name != 'DocCategory':
            # Register a singleton instance of the new category class.
            categories[name] = ret()
        return ret
MAddToOperators(type): def __new__(cls, name, bases, dct): def member_is_defined(member): if member in dct: return True for bc in range(len(bases)): if member in bases[bc].__dict__: return True return False def get_member_value(member): if member in dct: return dct[member] for bc in range(len(bases)): if member in bases[bc].__dict__: return bases[bc].__dict__[member] raise Exception('Member does not exists in class {}'.format(name)) # We don't care about the parent class if name == 'Operator' or name == 'SrcOperator': return type.__new__(cls, name, bases, dct) # Mandatory members mm = ['categories', 'signature'] for m in mm: if m not in dct: raise Exception('Mandatory member "{}" not given in "{}"'. \ format(m, name)) # Check that all items in categories exists for c in dct['categories']: if type(c) == str: raise Exception( \ 'Category "{}" must not be a string for operator "{}"'. \ format(c, name)) if not hasattr(c, 'name'): raise Exception( \ 'Category "{}" does not exist for operator "{}"'. \ format(c.__class__.__name__, name)) if c.name not in categories: raise Exception( \ 'Category "{}" does not exist for operator "{}"'. \ format(c.__class__.__name__, name)) # Some defaults, that are fixed by the implementation (dct['name'], dct['params']) = common.parse_signature(dct['signature']) if 'output_to' in dct: if dct['output_to'] == common.OUTPUT_TO_SAME_TYPE: dct['closed'] = True else: dct['closed'] = False else: dct['closed'] = True dct['output_to'] = common.OUTPUT_TO_SAME_TYPE # If the operator takes as inputs vectors and returns a scalar, then # by default we cannot autogenerate the C++ advanced API because we # cannot guess how to combine pieces of a unrolled pack if 'autogen_cxx_adv' not in dct: if dct['params'][0] in ['p', 's']: dct['autogen_cxx_adv'] = False else: dct['autogen_cxx_adv'] = True # By default tests are done on random numbers depending on the type # but sometimes one needs to produce only integers even if the # type is a floating point type. 
if 'tests_on_integers_only' not in dct: dct['tests_on_integers_only'] = False; # Fill domain, default is [-20 ; +20] if 'domain' not in dct: dct['domain'] = [[-20, 20], [-20, 20], [-20, 20]] # Number of UFP (cf. documentation) for testing if 'ufp' not in dct: dct['ufp'] = {'f16': 8, 'f32': 18, 'f64': 45} # Check that params is not empty if len(dct['params']) == 0: raise Exception('"params" is empty for operator "{}"'. \ format(name)) # Fill full_name, default is same as name if 'full_name' not in dct: dct['full_name'] = name # Fill desc, default is a basic sentence using full_name if 'desc' not in dct: arg = 'arguments' if len(dct['params']) > 2 else 'argument' if dct['params'][0] == '_': dct['desc'] = '{} the {}.'. \ format(dct['full_name'].capitalize(), arg) else: dct['desc'] = 'Returns the {} of the {}.'.\ format(dct['full_name'], arg) # Fill src, default is operator is in header not in source if not member_is_defined('src'): dct['src'] = False # Fill load_store, default is operator is not for loading/storing if 'load_store' not in dct: dct['load_store'] = False # Fill has_scalar_impl, default is based on several properties if 'has_scalar_impl' not in dct: if DocShuffle in dct['categories'] or \ DocMisc in dct['categories'] or \ 'vx2' in dct['params'] or \ 'vx3' in dct['params'] or \ 'vx4' in dct['params'] or \ dct['output_to'] in [common.OUTPUT_TO_UP_TYPES, common.OUTPUT_TO_DOWN_TYPES] or \ dct['load_store']: dct['has_scalar_impl'] = False else: dct['has_scalar_impl'] = True ret = type.__new__(cls, name, bases, dct) operators[dct['name']] = ret() return ret class Operator(object, metaclass=MAddToOperators): # Default values (for general purpose) cxx_operator = None autogen_cxx_adv = True output_to = common.OUTPUT_TO_SAME_TYPE types = common.types params = [] aliases = [] signature = '' # Enable bench by default do_bench = True # Default values (for documentation) desc = '' # Defaults values (for benches) returns_any_type = False bench_auto_against_cpu = True 
bench_auto_against_mipp = False bench_auto_against_sleef = False bench_auto_against_std = False use_for_parsing = True @property def returns(self): return self.params[0] @property def args(self): return self.params[1:] def __init__(self): (self.name, self.params) = common.parse_signature(self.signature) super(Operator, self).__init__() def get_return(self): return self.params[0] def tests_mpfr_name(self): return 'mpfr_' + self.name def bench_mipp_name(self, typ): return 'mipp::{}<{}>'.format(self.name, typ) def bench_mipp_types(self): return common.ftypes_no_f16 def bench_sleef_name(self, simd, typ): return common.sleef_name(self.name, simd, typ) def bench_sleef_types(self): return common.ftypes_no_f16 def bench_std_name(self, simd, typ): return 'std::{}'.format(self.name) def bench_std_types(self): return self.types # TODO: move to gen_archis.py def get_header_guard(self, platform, simd_ext): return 'NSIMD_{}_{}_{}_H'.format(platform.upper(), simd_ext.upper(), self.name.upper()) def get_fmtspec(self, t, tt, simd_ext): ret = {} return_typ = common.get_one_type_specific(self.params[0], simd_ext, tt) ret['return_typ'] = return_typ ret['returns'] = '' if return_typ == 'void' else 'return ' args_list = common.enum([common.get_one_type_specific(p, simd_ext, t) for p in self.params[1:]]) if len(args_list) > 0: ret['c_args'] = ', '.join(['{} a{}'.format(i[1], i[0]) for i in args_list]) ret['cxx_args'] = ret['c_args'] + ', ' else: ret['c_args'] = 'void' ret['cxx_args'] = '' if self.closed: ret['cxx_args'] += '{}, {}'.format(t, simd_ext) else: ret['cxx_args'] += '{}, {}, {}'.format(t, tt, simd_ext) ret['vas'] = ', '.join(['a{}'.format(i[0]) for i in args_list]) ret['suf'] = tt if self.closed else '{}_{}'.format(tt, t) ret['name'] = self.name ret['hbar'] = common.hbar ret['simd_ext'] = simd_ext if self.src and 'sleef_symbol_prefix' in self.__class__.__dict__: ret['sleef_symbol_prefix'] = self.sleef_symbol_prefix return ret def get_generic_signature(self, lang): if lang == 
'c_base': vas = common.get_args(len(self.params) - 1) args = vas + (', ' if vas != '' else '') args += 'from_type, to_type' if not self.closed else 'type' return ['#define v{name}({args})'.format(name=self.name, args=args), '#define v{name}_e({args}, simd_ext)'. \ format(name=self.name, args=args)] elif lang == 'c_adv': args = ['a{}'.format(i - 1) for i in range(1, len(self.params))] if not self.closed: args = ['to_type'] + args args = ', '.join(args) return '#define nsimd_{}({})'.format(self.name, args) elif lang == 'cxx_base': def get_type(param, typename): if param == '_': return 'void' elif param == 'p': return 'int' elif param == 's': return typename elif param == '*': return '{}*'.format(typename) elif param == 'c*': return '{} const*'.format(typename) elif param == 'vi': return 'typename simd_traits::itype,' \ ' NSIMD_SIMD>::simd_vector'.format(typename) elif param == 'l': return \ 'typename simd_traits<{}, NSIMD_SIMD>::simd_vectorl'. \ format(typename) elif param.startswith('v'): return \ 'typename simd_traits<{}, NSIMD_SIMD>::simd_vector{}'. 
\ format(typename, param[1:]) else: raise ValueError("Unknown param '{}'".format(param)) return_typ = get_type(self.params[0], 'T') args_list = common.enum(self.params[1:]) if not self.closed : tmpl_args = 'NSIMD_CONCEPT_VALUE_TYPE F, ' \ 'NSIMD_CONCEPT_VALUE_TYPE T' typename = 'F' else: tmpl_args = 'NSIMD_CONCEPT_VALUE_TYPE T' typename = 'T' temp = ', '.join(['{} a{}'.format(get_type(a[1], typename), a[0]) for a in args_list]) temp += ', ' if temp != '' else '' if not self.closed: func_args = temp + 'F, T' if self.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES: cxx20_require = \ 'NSIMD_REQUIRES(sizeof_v == sizeof_v) ' elif self.output_to == common.OUTPUT_TO_UP_TYPES: cxx20_require = \ 'NSIMD_REQUIRES(2 * sizeof_v == sizeof_v) ' else: cxx20_require = \ 'NSIMD_REQUIRES(sizeof_v == 2 * sizeof_v) ' else: func_args = temp + 'T' cxx20_require = '' return 'template <{tmpl_args}> {cxx20_require}{return_typ} ' \ 'NSIMD_VECTORCALL {name}({func_args});'. \ format(return_typ=return_typ, tmpl_args=tmpl_args, func_args=func_args, name=self.name, cxx20_require=cxx20_require) elif lang == 'cxx_adv': def get_type(param, typename, N): if param == '_': return 'void' elif param == 'p': return 'int' elif param == 's': return typename elif param == '*': return '{}*'.format(typename) elif param == 'c*': return '{} const*'.format(typename) elif param == 'vi': return 'pack::itype, {}, SimdExt>'. \ format(typename, N) elif param == 'l': return 'packl<{}, {}, SimdExt>'.format(typename, N) elif param.startswith('v'): return 'pack{}<{}, {}, SimdExt>'. \ format(param[1:], typename, N) else: raise ValueError("Unknown param '{}'".format(param)) args_list = common.enum(self.params[1:]) # Do we need tag dispatching on pack<>? e.g. 
len, set1 and load* inter = [i for i in ['v', 'l', 'vi', 'vx2', 'vx3', 'vx4'] \ if i in self.params[1:]] tag_dispatching = (inter == []) # Compute template arguments tmpl_args1 = ['NSIMD_CONCEPT_VALUE_TYPE T', 'NSIMD_CONCEPT_SIMD_EXT SimdExt'] tmpl_argsN = ['NSIMD_CONCEPT_VALUE_TYPE T', 'int N', 'NSIMD_CONCEPT_SIMD_EXT SimdExt'] def get_PACK(arg): if arg == 'l': return 'PACKL' elif arg == 'v': return 'PACK' else: return 'PACK{}'.format(arg[1:].upper()) if not self.closed: tmpl = 'NSIMD_CONCEPT_{} ToPackType'. \ format(get_PACK(self.params[0])) tmpl_args1 = [tmpl] + tmpl_args1 tmpl_argsN = [tmpl] + tmpl_argsN tmpl_args1 = ', '.join(tmpl_args1) tmpl_argsN = ', '.join(tmpl_argsN) # Compute function arguments def arg_type(arg, typename, N): if arg in ['v', 'vi', 'vx2', 'vx3', 'vx4', 'l']: return '{} const&'.format(get_type(arg, typename, N)) else: return get_type(arg, typename, N) args1 = ['{} a{}'.format(arg_type(i[1], 'T', '1'), i[0]) \ for i in args_list] argsN = ['{} a{}'.format(arg_type(i[1], 'T', 'N'), i[0]) \ for i in args_list] # Arguments without tag dispatching on pack other_argsN = ', '.join(argsN) # If we need tag dispatching, then the first argument type # is the output type: # 1. If not closed, then the output type is ToPackType # 2. 
If closed, then the output type is pack if not self.closed: args1 = ['ToPackType const&'] + args1 argsN = ['ToPackType const&'] + argsN elif tag_dispatching: args1 = [arg_type(self.params[0], 'T', '1')] + args1 argsN = [arg_type(self.params[0], 'T', 'N')] + argsN args1 = ', '.join(args1) argsN = ', '.join(argsN) # Compute return type if not self.closed: ret1 = 'ToPackType' retN = 'ToPackType' else: ret1 = get_type(self.params[0], 'T', '1') retN = get_type(self.params[0], 'T', 'N') # For non closed operators that need tag dispatching we have a # require clause cxx20_require = '' if not self.closed: tmpl = 'NSIMD_REQUIRES((' \ '{}sizeof_v == ' \ '{}sizeof_v && ' \ 'ToPackType::unroll == {{}} && '\ 'std::is_same_v))' if self.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES: cxx20_require = tmpl.format('', '') elif self.output_to == common.OUTPUT_TO_UP_TYPES: cxx20_require = tmpl.format('', '2 * ') else: cxx20_require = tmpl.format('2 * ', '') ret = { \ '1': 'template <{tmpl_args1}> {cxx20_require}{ret1} ' \ '{cxx_name}({args1});'. \ format(tmpl_args1=tmpl_args1, cxx20_require=cxx20_require.format('1'), ret1=ret1, args1=args1, cxx_name=self.name), 'N': 'template <{tmpl_argsN}> {cxx20_require}{retN} ' \ '{cxx_name}({argsN});'. \ format(tmpl_argsN=tmpl_argsN, cxx20_require=cxx20_require.format('N'), retN=retN, argsN=argsN, cxx_name=self.name) } if self.cxx_operator: ret.update({ \ 'op1': '''template <{tmpl_args1}> {ret1} operator{cxx_name}({args1});'''. \ format(tmpl_args1=tmpl_args1, ret1=ret1, args1=args1, cxx_name=self.cxx_operator), 'opN': '''template <{tmpl_argsN}> {retN} operator{cxx_name}({argsN});'''. \ format(tmpl_argsN=tmpl_argsN, retN=retN, argsN=argsN, cxx_name=self.cxx_operator) }) if not self.closed: ret['dispatch'] = \ 'template <{tmpl_argsN}> {cxx20_require}{retN} ' \ '{cxx_name}({other_argsN});'. 
\ format(tmpl_argsN=tmpl_argsN, cxx20_require=cxx20_require.format('N'), other_argsN=other_argsN, retN=retN, cxx_name=self.name) elif tag_dispatching: if [i for i in ['s', '*', 'c*'] if i in self.params[1:]] == []: tmpl_T = '' requires = '' else: tmpl_T = ', NSIMD_CONCEPT_VALUE_TYPE T' requires = 'NSIMD_REQUIRES((' \ 'std::is_same_v))' ret['dispatch'] = \ '''template {requires} SimdVector {cxx_name}({other_argsN});'''.format( PACK=get_PACK(self.params[0]), requires=requires, other_argsN=other_argsN, cxx_name=self.name, tmpl_T=tmpl_T) return ret else: raise Exception('Lang must be one of c_base, cxx_base, cxx_adv') def get_signature(self, typename, lang, simd_ext): # Check that the type is available for this function if typename not in self.types: raise Exception('Type {} not supported for function {}'. \ format(typename, self.name)) fmtspec = self.get_fmtspec(typename, typename, simd_ext) if lang == 'c_base': sig = '{return_typ} NSIMD_VECTORCALL ' \ 'nsimd_{name}_{simd_ext}_{suf}({c_args})'.format(**fmtspec) elif lang == 'cxx_base': sig = '{return_typ} NSIMD_VECTORCALL ' \ '{name}({cxx_args})'.format(**fmtspec) elif lang == 'cxx_adv': sig = '' raise Exception('TODO cxx_adv for {}'.format(lang)) else: raise Exception('Unknown langage {}'.format(lang)) return sig def get_scalar_signature(self, cpu_gpu, t, tt, lang): sig = '__device__ ' if cpu_gpu == 'gpu' else '' sig += common.get_one_type_scalar(self.params[0], tt) + ' ' func_name = 'nsimd_' if lang == 'c' else '' func_name += 'gpu_' if cpu_gpu in ['gpu', 'oneapi'] else 'scalar_' func_name += self.name operator_on_logicals = (self.params == ['l'] * len(self.params)) if lang == 'c' and not operator_on_logicals: func_name += '_{}_{}'.format(tt, t) if not self.closed \ else '_{}'.format(t) sig += func_name args_list = common.enum([common.get_one_type_scalar(p, t) for p in self.params[1:]]) args = ['{} a{}'.format(i[1], i[0]) for i in args_list] if lang == 'cxx' and (not self.closed or \ ('v' not in self.params[1:] and 
not operator_on_logicals)): args = [tt] + args sig += '(' + ', '.join(args) + ')' return sig class SrcOperator(Operator): src = True types = common.ftypes # ----------------------------------------------------------------------------- # List of functions/operators class Len(Operator): full_name = 'vector length' signature = 'p len' categories = [DocMisc] class Set1(Operator): full_name = 'value broadcast' signature = 'v set1 s' categories = [DocMisc] desc = 'Returns a vector whose all elements are set to the given value.' class Set1l(Operator): full_name = 'logical value broadcast' signature = 'l set1l p' categories = [DocMisc] desc = 'Returns a vector whose all elements are set to the given ' \ 'boolean value: zero means false and nonzero means true.' class Loadu(Operator): signature = 'v loadu c*' load_store = True categories = [DocLoadStore] desc = 'Load data from unaligned memory.' class MaskoLoadu1(Operator): signature = 'v masko_loadu1 l c* v' load_store = True categories = [DocLoadStore] desc = 'Load data from unaligned memory corresponding to True elements.' class MaskzLoadu1(Operator): signature = 'v maskz_loadu1 l c*' load_store = True categories = [DocLoadStore] desc = 'Load data from unaligned memory corresponding to True elements.' class Load2u(Operator): full_name = 'load array of structure' signature = 'vx2 load2u c*' load_store = True categories = [DocLoadStore] desc = 'Load array of structures of 2 members from unaligned memory.' class Load3u(Operator): full_name = 'load array of structure' signature = 'vx3 load3u c*' load_store = True categories = [DocLoadStore] desc = 'Load array of structures of 3 members from unaligned memory.' class Load4u(Operator): full_name = 'load array of structure' signature = 'vx4 load4u c*' load_store = True categories = [DocLoadStore] desc = 'Load array of structures of 4 members from unaligned memory.' 
# Aligned-memory loads (plain, masked, AoS de-interleaving and logical),
# plus the first unaligned stores. Each class only *describes* an operator;
# code generation is driven by `signature`, `categories` and `desc`.

class Loada(Operator):
    signature = 'v loada c*'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Load data from aligned memory.'

class MaskoLoada(Operator):
    signature = 'v masko_loada1 l c* v'
    load_store = True
    categories = [DocLoadStore]
    # Fixed: previously said just 'Load data from aligned memory.', omitting
    # the masking semantics; now matches MaskoLoadu1/MaskzLoada wording.
    desc = 'Load data from aligned memory corresponding to True elements.'

class MaskzLoada(Operator):
    signature = 'v maskz_loada1 l c*'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Load data from aligned memory corresponding to True elements.'

class Load2a(Operator):
    full_name = 'load array of structure'
    signature = 'vx2 load2a c*'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Load array of structures of 2 members from aligned memory.'

class Load3a(Operator):
    full_name = 'load array of structure'
    signature = 'vx3 load3a c*'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Load array of structures of 3 members from aligned memory.'

class Load4a(Operator):
    full_name = 'load array of structure'
    signature = 'vx4 load4a c*'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Load array of structures of 4 members from aligned memory.'

class Loadlu(Operator):
    full_name = 'load vector of logicals'
    signature = 'l loadlu c*'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Load data from unaligned memory and interpret it as booleans. ' + \
           'Zero is interpreted as False and nonzero as True.'

class Loadla(Operator):
    full_name = 'load vector of logicals'
    signature = 'l loadla c*'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Load data from aligned memory and interpret it as booleans. ' + \
           'Zero is interpreted as False and nonzero as True.'

class Storeu(Operator):
    signature = '_ storeu * v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store SIMD vector into unaligned memory.'

class MaskStoreu1(Operator):
    signature = '_ mask_storeu1 l * v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store active SIMD vector elements into unaligned memory.'
# Array-of-structures stores (2/3/4 members) to unaligned and aligned
# memory, plus the plain/masked aligned stores.

class Store2u(Operator):
    # Fixed: full_name was missing here although every sibling
    # (Store3u/Store4u/Store2a/Store3a/Store4a) declares it.
    full_name = 'store into array of structures'
    signature = '_ store2u * v v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store 2 SIMD vectors as array of structures of 2 members into ' + \
           'unaligned memory.'

class Store3u(Operator):
    full_name = 'store into array of structures'
    signature = '_ store3u * v v v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store 3 SIMD vectors as array of structures of 3 members into ' + \
           'unaligned memory.'

class Store4u(Operator):
    full_name = 'store into array of structures'
    signature = '_ store4u * v v v v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store 4 SIMD vectors as array of structures of 4 members into ' + \
           'unaligned memory.'

class Storea(Operator):
    signature = '_ storea * v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store SIMD vector into aligned memory.'

class MaskStorea1(Operator):
    signature = '_ mask_storea1 l * v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store active SIMD vector elements into aligned memory.'

class Store2a(Operator):
    full_name = 'store into array of structures'
    signature = '_ store2a * v v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store 2 SIMD vectors as array of structures of 2 members into ' + \
           'aligned memory.'

class Store3a(Operator):
    full_name = 'store into array of structures'
    signature = '_ store3a * v v v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store 3 SIMD vectors as array of structures of 3 members into ' + \
           'aligned memory.'

class Store4a(Operator):
    full_name = 'store into array of structures'
    signature = '_ store4a * v v v v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store 4 SIMD vectors as array of structures of 4 members into ' + \
           'aligned memory.'
# Gather operators: offset-based (SIMD vector of offsets) and linear
# (scalar step). Masked variants are kept commented out, pending support.

class Gather(Operator):
    full_name = 'gather elements from memory into a SIMD vector'
    signature = 'v gather c* vi'
    load_store = True
    categories = [DocLoadStore]
    types = common.ftypes + ['i16', 'u16', 'u32', 'i32', 'i64', 'u64']
    desc = 'Gather elements from memory with base address given as first ' \
           'argument and offsets given as second argument.'

class GatherLinear(Operator):
    full_name = 'gather elements from memory into a SIMD vector'
    signature = 'v gather_linear c* p'
    load_store = True
    categories = [DocLoadStore]
    types = common.types
    # Fixed: desc was garbled ('This operator using a SIMD register.');
    # wording now mirrors ScatterLinear: the step is a scalar, so no SIMD
    # register is needed for the offsets.
    desc = 'Gather elements from memory with base address given as first ' \
           'argument and steps given as second argument. This operator ' \
           'avoids using a SIMD register.'

#class MaskzGather(Operator):
#    full_name = 'gather active elements from SIMD vector to memory and put ' \
#                'zeros in inactive elements.'
#    signature = 'v maskz_gather l * vi'
#    load_store = True
#    categories = [DocLoadStore]
#    types = common.ftypes + ['i16', 'u16', 'u32', 'i32', 'i64', 'u64']
#    desc = 'Gather elements from memory with base address given as second ' \
#           'argument and offsets given as third argument. Inactive elements ' \
#           '(first argument) are set to zero.'

#class MaskoGather(Operator):
#    full_name = 'gather active elements from SIMD vector to memory and put ' \
#                'zeros in inactive elements.'
#    signature = 'v masko_gather l * vi v'
#    load_store = True
#    categories = [DocLoadStore]
#    types = common.ftypes + ['i16', 'u16', 'u32', 'i32', 'i64', 'u64']
#    desc = 'Gather elements from memory with base address given as second ' \
#           'argument and offsets given as third argument. Inactive elements ' \
#           '(first argument) are set to corresponding elements from fourth ' \
#           'argument.'
# Scatter operators (offset-based and linear-step) and stores of vectors
# of logicals. The commented-out masked scatter is kept for reference.

class Scatter(Operator):
    full_name = 'scatter elements from SIMD vector to memory'
    signature = '_ scatter * vi v'
    load_store = True
    categories = [DocLoadStore]
    types = common.ftypes + ['i16', 'u16', 'u32', 'i32', 'i64', 'u64']
    desc = 'Scatter elements from third argument to memory with base ' \
           'address given as first argument and offsets given as second ' \
           'argument.'

class ScatterLinear(Operator):
    full_name = 'scatter elements from SIMD vector to memory'
    signature = '_ scatter_linear * p v'
    load_store = True
    categories = [DocLoadStore]
    types = common.types
    desc = 'Scatter elements from third argument to memory with base ' \
           'address given as first argument and steps given as second ' \
           'argument. This operator avoids using a SIMD register.'

#class MaskScatter(Operator):
#    full_name = 'scatter active elements from SIMD vector to memory'
#    signature = '_ mask_scatter l * vi v'
#    load_store = True
#    categories = [DocLoadStore]
#    types = common.ftypes + ['i16', 'u16', 'u32', 'i32', 'i64', 'u64']
#    desc = 'Scatter active (first argument) elements from fourth argument ' \
#           'to memory with base address given as second argument and ' \
#           'offsets given as third argument.'

class Storelu(Operator):
    full_name = 'store vector of logicals'
    signature = '_ storelu * l'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store SIMD vector of booleans into unaligned memory. True is ' + \
           'stored as 1 and False as 0.'

class Storela(Operator):
    full_name = 'store vector of logicals'
    signature = '_ storela * l'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store SIMD vector of booleans into aligned memory. True is ' + \
           'stored as 1 and False as 0.'
# Bitwise operators, logical (boolean-vector) operators and basic
# arithmetic. Classes carrying `cxx_operator` also get a C++ operator
# overload emitted by the generators.

class Orb(Operator):
    full_name = 'bitwise or'
    signature = 'v orb v v'
    cxx_operator = '|'
    categories = [DocBitsOperators]

class Andb(Operator):
    full_name = 'bitwise and'
    signature = 'v andb v v'
    cxx_operator = '&'
    categories = [DocBitsOperators]

class Andnotb(Operator):
    full_name = 'bitwise andnot'
    signature = 'v andnotb v v'
    categories = [DocBitsOperators]
    desc = 'Returns the bitwise andnot of its arguments, more precisely ' \
           '"arg1 and (not arg2)"'

class Notb(Operator):
    full_name = 'bitwise not'
    signature = 'v notb v'
    cxx_operator = '~'
    categories = [DocBitsOperators]

class Xorb(Operator):
    full_name = 'bitwise xor'
    signature = 'v xorb v v'
    cxx_operator = '^'
    categories = [DocBitsOperators]

class Orl(Operator):
    full_name = 'logical or'
    signature = 'l orl l l'
    cxx_operator = '||'
    categories = [DocLogicalOperators]

class Andl(Operator):
    full_name = 'logical and'
    signature = 'l andl l l'
    cxx_operator = '&&'
    categories = [DocLogicalOperators]

class Andnotl(Operator):
    full_name = 'logical andnot'
    signature = 'l andnotl l l'
    categories = [DocLogicalOperators]
    desc = 'Returns the logical andnot of its arguments, more precisely ' \
           '"arg1 and (not arg2)"'

class Xorl(Operator):
    full_name = 'logical xor'
    signature = 'l xorl l l'
    categories = [DocLogicalOperators]

class Notl(Operator):
    full_name = 'logical not'
    signature = 'l notl l'
    cxx_operator = '!'
    categories = [DocLogicalOperators]
    bench_auto_against_std = True

class Add(Operator):
    full_name = 'addition'
    signature = 'v add v v'
    cxx_operator = '+'
    categories = [DocBasicArithmetic]
    bench_auto_against_std = True
    bench_auto_against_mipp = True

class Sub(Operator):
    full_name = 'subtraction'
    signature = 'v sub v v'
    cxx_operator = '-'
    categories = [DocBasicArithmetic]
    bench_auto_against_std = True
    bench_auto_against_mipp = True

class Addv(Operator):
    full_name = 'horizontal sum'
    signature = 's addv v'
    categories = [DocMisc]
    desc = 'Returns the sum of all the elements contained in v'
    do_bench = False
    types = common.ftypes

class Mul(Operator):
    full_name = 'multiplication'
    signature = 'v mul v v'
    cxx_operator = '*'
    categories = [DocBasicArithmetic]

class Div(Operator):
    full_name = 'division'
    signature = 'v div v v'
    cxx_operator = '/'
    # Divisor domain excludes a neighborhood of zero.
    domain = [[-20, 20], [0.5, 20]]
    categories = [DocBasicArithmetic]

class Neg(Operator):
    full_name = 'opposite'
    signature = 'v neg v'
    cxx_operator = '-'
    categories = [DocBasicArithmetic]

class Min(Operator):
    full_name = 'minimum'
    signature = 'v min v v'
    categories = [DocBasicArithmetic]

class Max(Operator):
    full_name = 'maximum'
    signature = 'v max v v'
    categories = [DocBasicArithmetic]

class Shr(Operator):
    full_name = 'right shift in zeros'
    signature = 'v shr v p'
    types = common.iutypes
    cxx_operator = '>>'
    categories = [DocBitsOperators]

class Shl(Operator):
    full_name = 'left shift'
    signature = 'v shl v p'
    types = common.iutypes
    cxx_operator = '<<'
    categories = [DocBitsOperators]

class Shra(Operator):
    full_name = 'arithmetic right shift'
    signature = 'v shra v p'
    types = common.iutypes
    categories = [DocBitsOperators]
    desc = 'Performs a right shift operation with sign extension.'
# Comparisons, blend, fused multiply ops, roundings, reductions over
# logicals and bit-preserving reinterpretation.

class Eq(Operator):
    full_name = 'compare for equality'
    signature = 'l eq v v'
    cxx_operator = '=='
    categories = [DocComparison]

class Ne(Operator):
    full_name = 'compare for inequality'
    signature = 'l ne v v'
    cxx_operator = '!='
    categories = [DocComparison]
    desc = 'Compare the inputs for inequality.'

class Gt(Operator):
    full_name = 'compare for greater-than'
    signature = 'l gt v v'
    cxx_operator = '>'
    categories = [DocComparison]
    desc = 'Compare the inputs for greater-than.'

class Ge(Operator):
    full_name = 'compare for greater-or-equal-than'
    signature = 'l ge v v'
    cxx_operator = '>='
    categories = [DocComparison]
    desc = 'Compare the inputs for greater-or-equal-than.'

class Lt(Operator):
    full_name = 'compare for lesser-than'
    signature = 'l lt v v'
    cxx_operator = '<'
    categories = [DocComparison]
    desc = 'Compare the inputs for lesser-than.'

class Le(Operator):
    full_name = 'compare for lesser-or-equal-than'
    signature = 'l le v v'
    cxx_operator = '<='
    categories = [DocComparison]
    desc = 'Compare the inputs for lesser-or-equal-than.'

class If_else1(Operator):
    full_name = 'blend'
    signature = 'v if_else1 l v v'
    categories = [DocMisc]
    # Fixed: the else-branch of the description wrongly said 'elements of
    # the second input are taken' for both cases; the false case takes the
    # THIRD input.
    desc = 'Blend the inputs using the vector of logicals as the first ' + \
           'argument. Elements of the second input are taken when the ' + \
           'corresponding elements from the vector of logicals are true, ' + \
           'otherwise elements of the third input are taken.'

class Abs(Operator):
    full_name = 'absolute value'
    signature = 'v abs v'
    categories = [DocBasicArithmetic]

class Fma(Operator):
    full_name = 'fused multiply-add'
    signature = 'v fma v v v'
    categories = [DocBasicArithmetic]
    desc = 'Multiply the first and second inputs and then adds the third ' + \
           'input.'
    # Float FMA results differ from mul+add reference, so test only ints.
    tests_on_integers_only = True

class Fnma(Operator):
    full_name = 'fused negate-multiply-add'
    signature = 'v fnma v v v'
    categories = [DocBasicArithmetic]
    desc = 'Multiply the first and second inputs, negate the intermediate ' + \
           'result and then adds the third input.'
    tests_on_integers_only = True

class Fms(Operator):
    full_name = 'fused multiply-substract'
    signature = 'v fms v v v'
    categories = [DocBasicArithmetic]
    # Fixed: previous wording ('Substracts the third input to
    # multiplication the first and second inputs.') was garbled.
    desc = 'Subtracts the third input from the product of the first and ' + \
           'second inputs.'
    tests_on_integers_only = True

class Fnms(Operator):
    full_name = 'fused negate-multiply-substract'
    signature = 'v fnms v v v'
    categories = [DocBasicArithmetic]
    desc = 'Multiply the first and second inputs, negate the intermediate ' + \
           'result and then substracts the third input to the ' + \
           'intermediate result.'
    tests_on_integers_only = True

class Ceil(Operator):
    full_name = 'rounding up to integer value'
    signature = 'v ceil v'
    categories = [DocRounding]

class Floor(Operator):
    full_name = 'rounding down to integer value'
    signature = 'v floor v'
    categories = [DocRounding]

class Trunc(Operator):
    full_name = 'rounding towards zero to integer value'
    signature = 'v trunc v'
    categories = [DocRounding]

class Round_to_even(Operator):
    full_name = 'rounding to nearest integer value, tie to even'
    signature = 'v round_to_even v'
    categories = [DocRounding]

class All(Operator):
    full_name = 'check all elements'
    signature = 'p all l'
    categories = [DocMisc]
    desc = 'Return true if and only if all elements of the inputs are true.'

class Any(Operator):
    full_name = 'check for one true elements'
    signature = 'p any l'
    categories = [DocMisc]
    desc = 'Return true if and only if at least one element of the inputs ' + \
           'is true.'

class Nbtrue(Operator):
    full_name = 'count true elements'
    signature = 'p nbtrue l'
    categories = [DocMisc]
    desc = 'Return the number of true elements in the input.'

class Reinterpret(Operator):
    full_name = 'reinterpret vector'
    signature = 'v reinterpret v'
    output_to = common.OUTPUT_TO_SAME_SIZE_TYPES
    categories = [DocConversion]
    desc = 'Reinterpret input vector into a different vector type ' + \
           'preserving all bits.'
# Conversion operators: logical reinterpretation, element-wise value
# conversion and widening/narrowing conversions.

class Reinterpretl(Operator):
    full_name = 'reinterpret vector of logicals'
    signature = 'l reinterpretl l'
    categories = [DocConversion]
    output_to = common.OUTPUT_TO_SAME_SIZE_TYPES
    has_scalar_impl = False
    desc = 'Reinterpret input vector of logicals into a different vector ' + \
           'type of logicals preserving all elements values. The output ' + \
           'type must have same length as input type.'

class Cvt(Operator):
    full_name = 'convert vector'
    signature = 'v cvt v'
    output_to = common.OUTPUT_TO_SAME_SIZE_TYPES
    categories = [DocConversion]
    desc = 'Convert input vector into a different vector type. The output ' + \
           'type must have same length as input type.'

class Upcvt(Operator):
    full_name = 'convert vector to larger type'
    signature = 'vx2 upcvt v'
    output_to = common.OUTPUT_TO_UP_TYPES
    # No 64-bit types here: there is no wider type to convert to.
    types = ['i8', 'u8', 'i16', 'u16', 'f16', 'i32', 'u32', 'f32']
    categories = [DocConversion]
    desc = 'Convert input vector into a different larger vector type. The ' + \
           'output type must be twice as large as the input type.'

class Downcvt(Operator):
    full_name = 'convert vector to narrow type'
    signature = 'v downcvt v v'
    output_to = common.OUTPUT_TO_DOWN_TYPES
    # No 8-bit types here: there is no narrower type to convert to.
    types = ['i16', 'u16', 'f16', 'i32', 'u32', 'f32', 'i64', 'u64', 'f64']
    categories = [DocConversion]
    # Fixed: 'twice as less as the input type' was broken English.
    desc = 'Convert input vector into a different narrow vector type. The ' + \
           'output type must be half as large as the input type.'
# Reciprocal, square root and (reciprocal) square-root approximations
# (ufp = units of precision in the fraction part, used by the tests),
# plus half-vector zip operators.

class Rec(Operator):
    full_name = 'reciprocal'
    signature = 'v rec v'
    types = common.ftypes
    # Domain excludes a neighborhood of zero.
    domain = [[-20, -0.5, 0.5, 20]]
    categories = [DocBasicArithmetic]

class Rec11(Operator):
    full_name = 'reciprocal with relative error at most $2^{-11}$'
    signature = 'v rec11 v'
    types = common.ftypes
    categories = [DocBasicArithmetic]
    domain = [[-20, -0.5, 0.5, 20]]
    ufp = { 'f16': 10, 'f32': 10, 'f64': 10 }

class Rec8(Operator):
    full_name = 'reciprocal with relative error at most $2^{-8}$'
    signature = 'v rec8 v'
    types = common.ftypes
    categories = [DocBasicArithmetic]
    domain = [[-20, -0.5, 0.5, 20]]
    ufp = { 'f16': 7, 'f32': 7, 'f64': 7 }

class Sqrt(Operator):
    full_name = 'square root'
    signature = 'v sqrt v'
    types = common.ftypes
    domain = [[0, 20]]
    categories = [DocBasicArithmetic]

class Rsqrt11(Operator):
    # Fixed: full_name said 'square root' although rsqrt computes the
    # RECIPROCAL square root (plain square root is the Sqrt operator).
    full_name = 'reciprocal square root with relative error at most $2^{-11}$'
    signature = 'v rsqrt11 v'
    types = common.ftypes
    domain = [[0.5, 20]]
    ufp = { 'f16': 10, 'f32': 10, 'f64': 10 }
    categories = [DocBasicArithmetic]

class Rsqrt8(Operator):
    # Fixed: same full_name correction as Rsqrt11.
    full_name = 'reciprocal square root with relative error at most $2^{-8}$'
    signature = 'v rsqrt8 v'
    types = common.ftypes
    domain = [[0.5, 20]]
    ufp = { 'f16': 7, 'f32': 7, 'f64': 7 }
    categories = [DocBasicArithmetic]

class Ziplo(Operator):
    full_name = 'zip low halves'
    signature = 'v ziplo v v'
    types = common.types
    categories = [DocShuffle]
    desc = 'Construct a vector where elements of the first low half input ' + \
           'are followed by the corresponding element of the second low ' + \
           'half input.'

class Ziphi(Operator):
    full_name = 'zip high halves'
    signature = 'v ziphi v v'
    types = common.types
    categories = [DocShuffle]
    desc = 'Construct a vector where elements of the first high half ' + \
           'input are followed by the corresponding element of the second ' + \
           'high half input.'
# Unzip/zip shuffles, mask construction helpers and loop-tail utilities.

class Unziplo(Operator):
    full_name = 'unziplo'
    signature = 'v unziplo v v'
    types = common.types
    categories = [DocShuffle]

class Unziphi(Operator):
    full_name = 'unziphi'
    signature = 'v unziphi v v'
    types = common.types
    categories = [DocShuffle]

class Zip(Operator):
    full_name = 'zip'
    signature = 'vx2 zip v v'
    types = common.types
    categories = [DocShuffle]

class Unzip(Operator):
    full_name = 'unzip'
    signature = 'vx2 unzip v v'
    types = common.types
    categories = [DocShuffle]

class ToMask(Operator):
    full_name = 'build mask from logicals'
    signature = 'v to_mask l'
    categories = [DocLogicalOperators]
    desc = 'Returns a mask consisting of all ones for true elements and ' + \
           'all zeros for false elements.'

class ToLogical(Operator):
    full_name = 'build logicals from data'
    signature = 'l to_logical v'
    categories = [DocLogicalOperators]
    desc = 'Returns a vector of logicals. Set true when the corresponding ' + \
           'elements are non zero (at least one bit to 1) and false ' + \
           'otherwise.'

class Iota(Operator):
    full_name = 'fill vector with increasing values'
    signature = 'v iota'
    categories = [DocMisc]
    # Fixed: typo 'Returns a vectors'.
    desc = 'Returns a vector whose first element is zero, the second is ' \
           'one and so on.'

class MaskForLoopTail(Operator):
    full_name = 'build mask for ending loops'
    signature = 'l mask_for_loop_tail p p'
    categories = [DocMisc]
    desc = 'Returns a mask for loading/storing data at loop tails by ' \
           'setting the first elements to True and the last to False. ' \
           'The first argument is index in a loop whose number of elements ' \
           'is given by the second argument.'
# Saturated arithmetic, then the Sleef-backed math operators (SrcOperator:
# implemented in compiled sources, not generated inline). The _u35/_u10
# suffix is the maximum error in ulps (3.5 / 1.0).

class Adds(Operator):
    full_name = 'addition using saturation'
    signature = 'v adds v v'
    categories = [DocBasicArithmetic]
    desc = 'Returns the saturated sum of the two vectors given as arguments'

class Subs(Operator):
    full_name = 'subtraction using saturation'
    signature = 'v subs v v'
    categories = [DocBasicArithmetic]
    desc = 'Returns the saturated subtraction of the two vectors given as ' \
           'arguments'

class Sin_u35(SrcOperator):
    full_name = 'sine'
    signature = 'v sin_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_sin_u35'
    categories = [DocTrigo]
    # Fixed: the Sleef URL had been lost ('visit .'); restored below (and
    # in all sibling Sleef operators).
    desc = 'Compute the sine of its argument with a precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Cos_u35(SrcOperator):
    full_name = 'cosine'
    signature = 'v cos_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_cos_u35'
    categories = [DocTrigo]
    desc = 'Compute the cosine of its argument with a precision of ' \
           '3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Tan_u35(SrcOperator):
    full_name = 'tangent'
    signature = 'v tan_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_tan_u35'
    # Avoid the poles at +/- pi/2 (and their images).
    domain = [[-4.7, -1.6, -1.5, 1.5, 1.6, 4.7]]
    categories = [DocTrigo]
    desc = 'Compute the tangent of its argument with a precision of ' \
           '3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Asin_u35(SrcOperator):
    full_name = 'arcsine'
    signature = 'v asin_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_asin_u35'
    domain = [[-0.9, 0.9]]
    categories = [DocTrigo]
    desc = 'Compute the arcsine of its argument with a precision of ' \
           '3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Acos_u35(SrcOperator):
    full_name = 'arccosine'
    signature = 'v acos_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_acos_u35'
    domain = [[-0.9, 0.9]]
    categories = [DocTrigo]
    desc = 'Compute the arccosine of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'
# Sleef-backed operators, continued (3.5-ulp family end, 1.0-ulp start).
# The lost 'visit .' URL reference is restored throughout.

class Atan_u35(SrcOperator):
    full_name = 'arctangent'
    signature = 'v atan_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_atan_u35'
    categories = [DocTrigo]
    desc = 'Compute the arctangent of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Atan2_u35(SrcOperator):
    full_name = 'arctangent'
    signature = 'v atan2_u35 v v'
    sleef_symbol_prefix = 'nsimd_sleef_atan2_u35'
    domain = [[-20, 20], [-20, -0.5, 0.5, 20]]
    categories = [DocTrigo]
    desc = 'Compute the arctangent of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Log_u35(SrcOperator):
    full_name = 'natural logarithm'
    signature = 'v log_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_log_u35'
    domain = [[0.5, 20]]
    categories = [DocExpLog]
    desc = 'Compute the natural logarithm of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Cbrt_u35(SrcOperator):
    full_name = 'cube root'
    signature = 'v cbrt_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_cbrt_u35'
    categories = [DocBasicArithmetic]
    desc = 'Compute the cube root of its argument with a precision of ' \
           '3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Sin_u10(SrcOperator):
    full_name = 'sine'
    signature = 'v sin_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_sin_u10'
    categories = [DocTrigo]
    desc = 'Compute the sine of its argument with a precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Cos_u10(SrcOperator):
    full_name = 'cosine'
    signature = 'v cos_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_cos_u10'
    categories = [DocTrigo]
    desc = 'Compute the cosine of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Tan_u10(SrcOperator):
    full_name = 'tangent'
    signature = 'v tan_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_tan_u10'
    domain = [[-4.7, -1.6, -1.5, 1.5, 1.6, 4.7]]
    categories = [DocTrigo]
    desc = 'Compute the tangent of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'
# Sleef-backed 1.0-ulp inverse trigonometry and logarithms. The lost
# 'visit .' URL reference is restored throughout.

class Asin_u10(SrcOperator):
    full_name = 'arcsine'
    signature = 'v asin_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_asin_u10'
    domain = [[-0.9, 0.9]]
    categories = [DocTrigo]
    desc = 'Compute the arcsine of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Acos_u10(SrcOperator):
    full_name = 'arccosine'
    signature = 'v acos_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_acos_u10'
    domain = [[-0.9, 0.9]]
    categories = [DocTrigo]
    desc = 'Compute the arccosine of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Atan_u10(SrcOperator):
    full_name = 'arctangent'
    signature = 'v atan_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_atan_u10'
    categories = [DocTrigo]
    desc = 'Compute the arctangent of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Atan2_u10(SrcOperator):
    full_name = 'arctangent'
    signature = 'v atan2_u10 v v'
    sleef_symbol_prefix = 'nsimd_sleef_atan2_u10'
    domain = [[-20, 20], [-20, -0.5, 0.5, 20]]
    categories = [DocTrigo]
    desc = 'Compute the arctangent of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Log_u10(SrcOperator):
    full_name = 'natural logarithm'
    signature = 'v log_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_log_u10'
    domain = [[0.5, 20]]
    categories = [DocExpLog]
    desc = 'Compute the natural logarithm of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Cbrt_u10(SrcOperator):
    full_name = 'cube root'
    signature = 'v cbrt_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_cbrt_u10'
    categories = [DocBasicArithmetic]
    desc = 'Compute the cube root of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'
# Sleef-backed exponentials, power and hyperbolic functions. The lost
# 'visit .' URL reference is restored throughout.

class Exp_u10(SrcOperator):
    full_name = 'base-e exponential'
    signature = 'v exp_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_exp_u10'
    domain = [[-20, 5]]
    categories = [DocExpLog]
    desc = 'Compute the base-e exponential of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Pow_u10(SrcOperator):
    full_name = 'power'
    signature = 'v pow_u10 v v'
    sleef_symbol_prefix = 'nsimd_sleef_pow_u10'
    domain = [[0, 5], [-5, 5]]
    categories = [DocExpLog]
    desc = 'Compute the power of its argument with a precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Sinh_u10(SrcOperator):
    full_name = 'hyperbolic sine'
    signature = 'v sinh_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_sinh_u10'
    categories = [DocHyper]
    desc = 'Compute the hyperbolic sine of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Cosh_u10(SrcOperator):
    full_name = 'hyperbolic cosine'
    signature = 'v cosh_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_cosh_u10'
    categories = [DocHyper]
    desc = 'Compute the hyperbolic cosine of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Tanh_u10(SrcOperator):
    full_name = 'hyperbolic tangent'
    signature = 'v tanh_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_tanh_u10'
    categories = [DocHyper]
    desc = 'Compute the hyperbolic tangent of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Sinh_u35(SrcOperator):
    full_name = 'hyperbolic sine'
    signature = 'v sinh_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_sinh_u35'
    categories = [DocHyper]
    desc = 'Compute the hyperbolic sine of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Cosh_u35(SrcOperator):
    full_name = 'hyperbolic cosine'
    signature = 'v cosh_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_cosh_u35'
    categories = [DocHyper]
    desc = 'Compute the hyperbolic cosine of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'
# Sleef-backed inverse hyperbolic functions and base-2 exponentials. The
# lost 'visit .' URL reference is restored throughout.

class Tanh_u35(SrcOperator):
    full_name = 'hyperbolic tangent'
    signature = 'v tanh_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_tanh_u35'
    categories = [DocHyper]
    desc = 'Compute the hyperbolic tangent of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Asinh_u10(SrcOperator):
    full_name = 'inverse hyperbolic sine'
    signature = 'v asinh_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_asinh_u10'
    categories = [DocHyper]
    desc = 'Compute the inverse hyperbolic sine of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Acosh_u10(SrcOperator):
    full_name = 'inverse hyperbolic cosine'
    signature = 'v acosh_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_acosh_u10'
    categories = [DocHyper]
    # acosh is only defined for arguments >= 1.
    domain = [[1, 20]]
    desc = 'Compute the inverse hyperbolic cosine of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Atanh_u10(SrcOperator):
    full_name = 'inverse hyperbolic tangent'
    signature = 'v atanh_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_atanh_u10'
    # atanh diverges at +/- 1.
    domain = [[-0.9, 0.9]]
    categories = [DocHyper]
    desc = 'Compute the inverse hyperbolic tangent of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Exp2_u10(SrcOperator):
    full_name = 'base-2 exponential'
    signature = 'v exp2_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_exp2_u10'
    domain = [[-20, 5]]
    categories = [DocExpLog]
    desc = 'Compute the base-2 exponential of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Exp2_u35(SrcOperator):
    full_name = 'base-2 exponential'
    signature = 'v exp2_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_exp2_u35'
    domain = [[-20, 5]]
    categories = [DocExpLog]
    desc = 'Compute the base-2 exponential of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'
# Sleef-backed base-10 exponentials, expm1 and base-10/base-2 logarithms.
# The lost 'visit .' URL reference is restored throughout.

class Exp10_u10(SrcOperator):
    full_name = 'base-10 exponential'
    signature = 'v exp10_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_exp10_u10'
    domain = [[-5, 3]]
    categories = [DocExpLog]
    desc = 'Compute the base-10 exponential of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Exp10_u35(SrcOperator):
    full_name = 'base-10 exponential'
    signature = 'v exp10_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_exp10_u35'
    domain = [[-5, 3]]
    categories = [DocExpLog]
    desc = 'Compute the base-10 exponential of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Expm1_u10(SrcOperator):
    full_name = 'exponential minus 1'
    signature = 'v expm1_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_expm1_u10'
    domain = [[-5, 3]]
    categories = [DocExpLog]
    desc = 'Compute the exponential minus 1 of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Log10_u10(SrcOperator):
    full_name = 'base-10 logarithm'
    signature = 'v log10_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_log10_u10'
    domain = [[0.5, 20]]
    categories = [DocExpLog]
    desc = 'Compute the base-10 logarithm of its argument with a precision ' \
           'of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Log2_u10(SrcOperator):
    full_name = 'base-2 logarithm'
    signature = 'v log2_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_log2_u10'
    domain = [[0.5, 20]]
    categories = [DocExpLog]
    desc = 'Compute the base-2 logarithm of its argument with a precision ' \
           'of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Log2_u35(SrcOperator):
    full_name = 'base-2 logarithm'
    signature = 'v log2_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_log2_u35'
    domain = [[0.5, 20]]
    categories = [DocExpLog]
    desc = 'Compute the base-2 logarithm of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'
# SLEEF wrappers: log1p, pi-scaled trigonometry, hypot and remainder.
# Declarative metadata only; test domains avoid the functions' singular or
# undefined regions.

class Log1p_u10(SrcOperator):
    signature = 'v log1p_u10 v'
    full_name = 'logarithm of 1 plus argument'
    sleef_symbol_prefix = 'nsimd_sleef_log1p_u10'
    # log1p(x) requires x > -1.
    domain = [[-0.5, 19]]
    categories = [DocExpLog]
    desc = ('Compute the logarithm of 1 plus argument of its argument with '
            'a precision of 1.0 ulps. For more informations visit .')

class Sinpi_u05(SrcOperator):
    signature = 'v sinpi_u05 v'
    full_name = 'sine of pi times argument'
    sleef_symbol_prefix = 'nsimd_sleef_sinpi_u05'
    categories = [DocTrigo]
    desc = ('Compute the sine of pi times argument of its argument with a '
            'precision of 0.5 ulps. For more informations visit .')

class Cospi_u05(SrcOperator):
    signature = 'v cospi_u05 v'
    full_name = 'cosine of pi times argument'
    sleef_symbol_prefix = 'nsimd_sleef_cospi_u05'
    categories = [DocTrigo]
    desc = ('Compute the cosine of pi times argument of its argument with '
            'a precision of 0.5 ulps. For more informations visit .')

class Hypot_u05(SrcOperator):
    # Binary operator: signature takes two vector arguments.
    signature = 'v hypot_u05 v v'
    full_name = 'Euclidean distance'
    sleef_symbol_prefix = 'nsimd_sleef_hypot_u05'
    categories = [DocBasicArithmetic]
    desc = ('Compute the Euclidean distance of its argument with a '
            'precision of 0.5 ulps. For more informations visit .')

class Hypot_u35(SrcOperator):
    signature = 'v hypot_u35 v v'
    full_name = 'Euclidean distance'
    sleef_symbol_prefix = 'nsimd_sleef_hypot_u35'
    categories = [DocBasicArithmetic]
    desc = ('Compute the Euclidean distance of its argument with a '
            'precision of 3.5 ulps. For more informations visit .')

class Remainder(SrcOperator):
    signature = 'v remainder v v'
    full_name = 'floating-point remainder'
    sleef_symbol_prefix = 'nsimd_sleef_remainder'
    # Keep both operands away from zero to avoid the undefined y == 0 case.
    domain = [[1, 20], [1, 20]]
    categories = [DocBasicArithmetic]
    desc = ('Compute the floating-point remainder of its arguments. '
            'For more informations visit .')
# SLEEF wrappers: fmod, gamma and error functions. Declarative metadata read
# by the code generator (signature, SLEEF symbol, doc category, description).

class Fmod(SrcOperator):
    full_name = 'floating-point remainder'
    signature = 'v fmod v v'
    sleef_symbol_prefix = 'nsimd_sleef_fmod'
    # Keep both operands away from zero to avoid the undefined y == 0 case.
    domain = [[1, 20], [1, 20]]
    categories = [DocBasicArithmetic]
    # fmod is binary: "arguments" (plural) for consistency with Remainder.
    desc = 'Compute the floating-point remainder of its arguments. ' \
           'For more informations visit .'

class Lgamma_u10(SrcOperator):
    full_name = 'log gamma'
    signature = 'v lgamma_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_lgamma_u10'
    # Gamma has poles at non-positive integers; stay on the positive axis.
    domain = [[0.5, 20]]
    categories = [DocExpLog]
    desc = 'Compute the log gamma of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit .'

class Tgamma_u10(SrcOperator):
    full_name = 'true gamma'
    signature = 'v tgamma_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_tgamma_u10'
    # Tighter upper bound than lgamma: tgamma overflows quickly.
    domain = [[0.5, 5]]
    categories = [DocExpLog]
    desc = 'Compute the true gamma of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit .'

class Erf_u10(SrcOperator):
    # Was mislabeled 'complementary error' (copy-paste from Erfc_u15 below):
    # SLEEF's erf_u10 computes the error function erf(x), not erfc(x).
    full_name = 'error function'
    signature = 'v erf_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_erf_u10'
    categories = [DocExpLog]
    desc = 'Compute the error function of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit .'

class Erfc_u15(SrcOperator):
    full_name = 'complementary error'
    signature = 'v erfc_u15 v'
    sleef_symbol_prefix = 'nsimd_sleef_erfc_u15'
    categories = [DocExpLog]
    desc = 'Compute the complementary error of its argument with a ' \
           'precision of 1.5 ulps. ' \
           'For more informations visit .'
================================================ FILE: egg/platform_arm.py ================================================ # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # This file gives the implementation of platform ARM, i.e. ARM SIMD. # Reading this file is rather straightforward. ARM SIMD extensions are rather # coherent and consistent. It implements the following architectures: # - ARMv7 -> 128 bits registers without f16 and f64 support # - Aarch32 -> 128 bits registers with optional f16 and without f64 support # - Aarch64 -> 128 bits registers with optional f16 and f64 support # - SVE -> up to 2048 bits registers # The first three SIMD extensions are collectively called NEON. Aarch32 and # Aarch64 correspond respectively to ARMv8 32 and 64 bits chips. 
Note that # the ARM documentation says that ARMv7, Aarch32 are different but it seems # that they differ by only a handful of intrinsics which are not in the scope # of NSIMD so we have implemented the following: # # - ARMv7 \ -> neon128 # - Aarch32 / # - Aarch64 -> aarch64 # - SVE -> sve import common # ----------------------------------------------------------------------------- # Helpers def neon_typ(typ): prefix = {'i': 'int', 'u': 'uint', 'f': 'float'} return '{}{}x{}_t'.format(prefix[typ[0]], typ[1:], 128 // int(typ[1:])) def half_neon64_typ(typ): prefix = {'i': 'int', 'u': 'uint', 'f': 'float'} return '{}{}x{}_t'.format(prefix[typ[0]], typ[1:], 64 // int(typ[1:])) def sve_typ(typ): prefix = {'i': 'svint', 'u': 'svuint', 'f': 'svfloat'} return '{}{}_t'.format(prefix[typ[0]], typ[1:]) def suf(typ): if typ[0] == 'i': return 's{}'.format(typ[1:]) else: return typ neon = ['neon128', 'aarch64'] fixed_sized_sve = ['sve128', 'sve256', 'sve512', 'sve1024', 'sve2048'] sve = ['sve'] + fixed_sized_sve fmtspec = {} def convert_from_predicate(opts, op): if opts.sve_emulate_bool: return '''svsel({op}, svdup_n_u{typnbits}_x({svtrue}, (u{typnbits})~0), svdup_n_u{typnbits}_x({svtrue}, 0))'''. 
\ format(op=op, **fmtspec) else: return op def convert_to_predicate(opts, op): if opts.sve_emulate_bool: # TODO: the casts are a workaround to avoid a bug in gcc trunk for sve # it needs to be deleted when the bug is corrected return '''svcmpeq({svtrue}, (svuint{typnbits}_t){op}, svdup_n_u{typnbits}_x({svtrue}, (u{typnbits})~0))'''.format(op=op, **fmtspec) else: return op # ----------------------------------------------------------------------------- # Implementation of mandatory functions for this module def get_simd_exts(): return ['neon128', 'aarch64', 'sve', 'sve128', 'sve256', 'sve512', 'sve1024', 'sve2048'] def get_prev_simd_ext(simd_ext): if simd_ext in ['neon128', 'aarch64']: return 'cpu' elif simd_ext in sve: return 'aarch64' raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) def emulate_fp16(simd_ext): if not simd_ext in get_simd_exts(): raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) if simd_ext in sve: return False else: return True def get_type(opts, simd_ext, typ, nsimd_typ): if simd_ext in neon: if typ == 'f64': if simd_ext == 'neon128': return 'typedef struct {{ double v0; double v1; }} {};'. \ format(nsimd_typ) else: return 'typedef {} {};'.format(neon_typ('f64'), nsimd_typ) elif typ == 'f16': return ''' #ifdef NSIMD_ARM_FP16 typedef float16x8_t {nsimd_typ}; #else typedef struct {{ float32x4_t v0; float32x4_t v1; }} {nsimd_typ}; #endif '''.format(nsimd_typ=nsimd_typ) # extra \n are necessary else: return 'typedef {} {};'.format(neon_typ(typ), nsimd_typ) elif simd_ext == 'sve': return 'typedef {} {};'.format(sve_typ(typ), nsimd_typ) elif simd_ext in fixed_sized_sve: return 'typedef {} {} __attribute__((arm_sve_vector_bits({})));'. 
\ format(sve_typ(typ), nsimd_typ, simd_ext[3:]) else: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) def get_logical_type(opts, simd_ext, typ, nsimd_typ): if typ not in common.types: raise ValueError('Unknown type "{}"'.format(typ)) if simd_ext not in get_simd_exts(): raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) if typ in common.ftypes + common.itypes: typ2 = 'u{}'.format(typ[1:]); else: typ2 = typ if simd_ext == 'neon128': if typ == 'f16': return \ ''' #ifdef NSIMD_ARM_FP16 typedef uint16x8_t {nsimd_typ}; #else typedef struct {{ uint32x4_t v0; uint32x4_t v1; }} {nsimd_typ}; #endif '''.format(nsimd_typ=nsimd_typ) # extra \n are necessary elif typ == 'f64': return 'typedef struct {{ u64 v0; u64 v1; }} {};'.format(nsimd_typ) else: return get_type(opts, simd_ext, typ2, nsimd_typ) if simd_ext == 'aarch64': if typ == 'f16': return get_logical_type(opts, 'neon128', 'f16', nsimd_typ) else: return get_type(opts, simd_ext, typ2, nsimd_typ) elif simd_ext in sve: if opts.sve_emulate_bool: return get_type(opts, simd_ext, 'u' + typ[1:], nsimd_typ) elif simd_ext in fixed_sized_sve: return \ 'typedef svbool_t {} __attribute__((arm_sve_vector_bits({})));'. 
\ format(nsimd_typ, simd_ext[3:]) else: return 'typedef svbool_t {};'.format(nsimd_typ) def get_nb_registers(simd_ext): if simd_ext in neon: return '16' elif simd_ext in sve: return '32' else: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) def get_native_soa_typ(simd_ext, typ, deg): prefix = { 'i': 'int', 'u': 'uint', 'f': 'float' }[typ[0]] if simd_ext in sve: return 'sv{}x{}_t'.format(prefix + typ[1:], deg) else: return '{}{}x{}x{}_t'.format(prefix, typ[1:], 128 // int(typ[1:]), deg) def get_SoA_type(simd_ext, typ, deg, nsimd_typ): if simd_ext != 'sve': raise ValueError('SIMD extension must be "sve"') prefix = { 'i': 'int', 'u': 'uint', 'f': 'float' }[typ[0]] return 'typedef {} {};'.format(get_native_soa_typ(simd_ext, typ, deg), nsimd_typ) def has_compatible_SoA_types(simd_ext): if simd_ext not in neon + sve: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) return False # ----------------------------------------------------------------------------- def get_additional_include(func, platform, simd_ext): ret = '''#include '''.format(func) if simd_ext in sve: ret += '''#include '''.format(func) if func in ['load2u', 'load3u', 'load4u', 'load2a', 'load3a', 'load4a']: deg = func[4] ret += '''#if NSIMD_CXX > 0 extern "C" {{ #endif NSIMD_INLINE nsimd_{simd_ext}_vu16x{deg} nsimd_{func}_{simd_ext}_u16(const u16*); # if NSIMD_CXX > 0 }} // extern "C" #endif '''.format(func=func, deg=deg, simd_ext=simd_ext) if func in ['mask_storea1', 'mask_storeu1', 'masko_loada1', 'masko_loadu1', 'maskz_loada1', 'maskz_loadu1'] and \ simd_ext not in sve: ret += '''#include ''' if func == 'mask_for_loop_tail' and simd_ext not in sve: ret += '''#include #include #include #include '''.format(simd_ext=simd_ext) if simd_ext == 'neon128' and func == 'notl': ret += '''#include ''' if simd_ext in neon and func == 'ne': ret += '''#include # include '''.format(simd_ext=simd_ext) if simd_ext in neon and func in ['fms', 'fnms']: ret += '''#include #include #include 
'''.format(simd_ext=simd_ext) if func == 'shra': ret += '''#include '''.format(simd_ext=simd_ext) if func in ['loadlu', 'loadla']: ret += '''#include # include # include # include '''.format(load='load' + func[5], simd_ext=simd_ext) if func in ['storelu', 'storela']: ret += '''#include # include # include '''.format(store='store' + func[6], simd_ext=simd_ext) if func == 'to_logical': ret += '''#include #include ''' .format(simd_ext=simd_ext) if func == 'zip': ret += '''#include #include '''.format(simd_ext=simd_ext) if func == 'unzip': ret += '''#include #include '''.format(simd_ext=simd_ext) if func == 'adds': ret += '''#include '''.format(simd_ext=simd_ext) if func == 'subs': ret += '''#include '''.format(simd_ext=simd_ext) if func in ['gather', 'scatter'] and simd_ext == 'sve': ret += '''#include ''' return ret # ----------------------------------------------------------------------------- # Emulators def emulate_op1(op, simd_ext, typ): if simd_ext in neon: le = 128 // int(typ[1:]); return '''int i; {typ} buf[{le}]; vst1q_{suf}(buf, {in0}); for (i=0; i < {le}; i += nsimd_len_cpu_{typ}()) {{ nsimd_storeu_cpu_{typ}( & buf[i], nsimd_{op}_cpu_{typ}( nsimd_loadu_cpu_{typ}(&buf[i])));}} return vld1q_{suf}(buf); '''. \ format(op=op, le=le, **fmtspec) if simd_ext in sve: le = 2048 // int(typ[1:]); return '''int i; {typ} buf[{le}]; svst1_{suf}({svtrue}, buf, {in0}); for (i=0; i < simd_len_{simd_ext}_{typ}(); i += nsimd_len_cpu_{typ}()) {{ nsimd_storeu_cpu_{typ}( & buf[i], nsimd_{op}_cpu_{typ}( nsimd_loadu_cpu_{typ}(&buf[i])));}} return svld1_{suf}({svtrue}, buf); '''. \ format(op=op, le=le, **fmtspec) def emulate_op2(op, simd_ext, typ): if simd_ext in neon: le = 128 // int(typ[1:]); return '''int i; {typ} buf0[{le}], buf1[{le}]; vst1q_{suf}(buf0, {in0}); vst1q_{suf}(buf1, {in1}); for (i=0; i < {le}; i++) {{ buf0[i] = ({typ})(buf0[i] {op} buf1[i]);}} return vld1q_{suf}(buf0); '''. 
\ format(op=op, le=le, **fmtspec) if simd_ext in sve: le = 2048 // int(typ[1:]); return '''int i; {typ} buf0[{le}], buf1[{le}]; svst1_{suf}({svtrue}, buf0, {in0}); svst1_{suf}({svtrue}, buf1, {in1}); for (i=0; i < nsimd_len_{simd_ext}_{typ}(); i++) {{ buf0[i] = ({typ})(buf0[i] {op} buf1[i]);}} return svld1_{suf}({svtrue}, buf0); '''. \ format(op=op, le=le, **fmtspec) def emulate_lop2_neon(opts, op, simd_ext, typ): le = 128 // int(typ[1:]); ltyp = get_logical_type(opts, simd_ext, typ) lsuf = suf(ltyp) return '''int i; {ltyp} buf0[{le}], buf1[{le}]; vst1q_{lsuf}(buf0, {in0}); vst1q_{lsuf}(buf1, {in1}); for (i = 0; i < {le}; i++) {{ buf0[i] = buf0[i] {op} buf1[i] ? ({ltyp})-1 : 0; }} return vld1q_{lsuf}(buf0);'''. \ format(op=op, le=le, ltyp=ltyp, lsuf=lsuf, **fmtspec) def emulate_op3_neon(op, simd_ext, typ): le = 128 // int(typ[1:]); return '''int i; {typ} buf0[{le}], buf1[{le}], buf2[{le}]; vst1q_{suf}(buf0, {in0}); vst1q_{suf}(buf1, {in1}); vst1q_{suf}(buf2, {in2}); for (i = 0; i < {le}; i += nsimd_len_cpu_{typ}()) {{ nsimd_storeu_cpu_{typ}(&buf0[i], nsimd_{op}_cpu_{typ}( nsimd_loadu_cpu_{typ}(&buf0[i]), nsimd_loadu_cpu_{typ}(&buf1[i]), nsimd_loadu_cpu_{typ}(&buf2[i]))); }} return vld1q_{suf}(buf0);'''.format(op=op, le=le, **fmtspec) def emulate_f64_neon(simd_ext, op, params): fmtspec2 = fmtspec.copy() fmtspec2['op'] = op fmtspec2['buf_ret_decl'] = 'nsimd_cpu_{}f64 buf_ret;'. \ format('v' if params[0] == 'v' else 'vl') fmtspec2['buf_decl'] = '\n'.join(['nsimd_cpu_{}f64 buf{};'. \ format('v' if p[1] == 'v' else 'vl', p[0]) \ for p in common.enum(params[1:])]) fmtspec2['bufs'] = ','.join(['buf{}'.format(i) \ for i in range(0, len(params) - 1)]) fmtspec2['ret_decl'] = 'nsimd_{}_{}f64 ret;'. \ format(simd_ext, 'v' if params[0] == 'v' else 'vl') buf_set = '\n'.join('''buf{i}.v0 = {ini}.v0; buf{i}.v1 = {ini}.v1;'''. 
\ format(i=i, ini=fmtspec['in{}'.format(i)]) \ for i in range(0, len(params) - 1)) return '''{buf_ret_decl} {buf_decl} {ret_decl} {buf_set} buf_ret = nsimd_{op}_cpu_f64({bufs}); ret.v0 = buf_ret.v0; ret.v1 = buf_ret.v1; return ret;'''.format(buf_set=buf_set, **fmtspec2) # ----------------------------------------------------------------------------- def f16f64(simd_ext, typ, op, armop, arity, forced_intrinsics = ''): fmtspec2 = fmtspec.copy() tmpl = ', '.join(['{{in{}}}.v{{{{i}}}}'.format(i).format(**fmtspec) \ for i in range(0, arity)]) fmtspec2['args1'] = tmpl.format(i='0') fmtspec2['args2'] = tmpl.format(i='1') fmtspec2['armop'] = armop fmtspec2['op'] = op if simd_ext in neon and typ == 'f16': if forced_intrinsics != '': fmtspec2['intrinsics'] = forced_intrinsics else: temp = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \ for i in range(0, arity)]) fmtspec2['intrinsics'] = 'return v{}q_f16({});'.format(armop, temp) return '''#ifdef NSIMD_ARM_FP16 {intrinsics} #else nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_{op}_{simd_ext}_f32({args1}); ret.v1 = nsimd_{op}_{simd_ext}_f32({args2}); return ret; #endif'''.format(**fmtspec2) elif simd_ext == 'neon128' and typ == 'f64': return emulate_f64_neon(simd_ext, op, ['v'] * (arity + 1)) return '' # ----------------------------------------------------------------------------- # Lenghts def max_len(simd_ext, typ): if simd_ext == 'sve': return 2048 // int(typ[1:]) elif simd_ext in fixed_sized_sve: return int(simd_ext[3:]) // int(typ[1:]) else: return 128 // int(typ[1:]) def real_len(simd_ext, typ): if simd_ext == 'sve': return 'nsimd_len_sve_{typ}()'.format(**fmtspec) else: return max_len(simd_ext, typ) # ----------------------------------------------------------------------------- # Loads of degree 1, 2, 3 and 4 def load1234(opts, simd_ext, typ, deg): if simd_ext in neon: if deg == 1: normal = 'return vld{deg}q_{suf}({in0});'. 
\ format(deg=deg, **fmtspec) if typ == 'f16': return \ '''#ifdef NSIMD_ARM_FP16 {normal} #else /* Note that we can do much better but is it useful? */ nsimd_{simd_ext}_vf16 ret; f32 buf[4]; buf[0] = nsimd_u16_to_f32(*(u16*){in0}); buf[1] = nsimd_u16_to_f32(*((u16*){in0} + 1)); buf[2] = nsimd_u16_to_f32(*((u16*){in0} + 2)); buf[3] = nsimd_u16_to_f32(*((u16*){in0} + 3)); ret.v0 = vld1q_f32(buf); buf[0] = nsimd_u16_to_f32(*((u16*){in0} + 4)); buf[1] = nsimd_u16_to_f32(*((u16*){in0} + 5)); buf[2] = nsimd_u16_to_f32(*((u16*){in0} + 6)); buf[3] = nsimd_u16_to_f32(*((u16*){in0} + 7)); ret.v1 = vld1q_f32(buf); return ret; #endif'''.format(normal=normal, **fmtspec) elif typ == 'f64' and simd_ext == 'neon128': return \ '''nsimd_neon128_vf64 ret; ret.v0 = *{in0}; ret.v1 = *({in0} + 1); return ret;'''.format(**fmtspec) else: return normal else: normal = \ '''nsimd_{simd_ext}_v{typ}x{deg} ret; {soa_typ} buf = vld{deg}q_{suf}({in0}); {assignment} return ret;'''. \ format(deg=deg, soa_typ=get_native_soa_typ(simd_ext, typ, deg), assignment='\n'.join(['ret.v{i} = buf.val[{i}];'. \ format(i=i) for i in range(0, deg)]), **fmtspec) if typ == 'f16': assignment = \ '''vst1q_u16(buf, temp.val[{{i}}]); ret.v{{i}} = nsimd_loadu_{simd_ext}_f16((f16 *)buf);'''. \ format(**fmtspec) return \ '''{soa_typ} temp = vld{deg}q_u16((u16 *){in0}); u16 buf[8]; nsimd_{simd_ext}_vf16x{deg} ret; {assignment} return ret;'''. \ format(deg=deg, assignment='\n'.join([assignment. \ format(i=i) for i in range(0, deg)]), soa_typ=get_native_soa_typ(simd_ext, 'u16', deg), **fmtspec) elif typ in 'f64' and simd_ext == 'neon128': return \ 'nsimd_neon128_vf64x{} ret;\n'.format(deg) + \ '\n'.join(['ret.v{i}.v0 = *({in0} + {i});'. \ format(i=i, **fmtspec) for i in range(0, deg)]) + \ '\n'.join(['ret.v{i}.v1 = *({in0} + {ipd});'. 
\ format(i=i, ipd=i + deg, **fmtspec) \ for i in range(0, deg)]) + \ '\nreturn ret;\n' elif typ in ['i64', 'u64'] and simd_ext == 'neon128': return \ '''nsimd_neon128_v{typ}x{deg} ret; {typ} buf[2];'''.format(deg=deg, **fmtspec) + \ '\n'.join(['''buf[0] = *({in0} + {i}); buf[1] = *({in0} + {ipd}); ret.v{i} = vld1q_{suf}(buf);'''. \ format(i=i, ipd=i + deg, **fmtspec) \ for i in range(0, deg)]) + \ '\nreturn ret;\n' else: return normal else: if deg == 1: return 'return svld{deg}_{suf}({svtrue}, {in0});'. \ format(deg=deg, **fmtspec) else: return \ '''nsimd_{simd_ext}_v{typ}x{deg} ret; {sve_typ} buf = svld{deg}_{suf}({svtrue}, {in0}); {assignment} return ret;'''.format(assignment=\ '\n'.join(['ret.v{i} = svget{deg}_{suf}(buf, {i});'. \ format(i=i, deg=deg, **fmtspec) \ for i in range(deg)]), sve_typ=get_native_soa_typ('sve', typ, deg), deg=deg, **fmtspec) # ----------------------------------------------------------------------------- # Mask loads def maskoz_load(oz, simd_ext, typ): if simd_ext in sve: return 'return svsel_{suf}({in0}, svld1_{suf}({in0}, {in1}), {oz});'. \ format(oz='{in2}'.format(**fmtspec) if oz == 'o' \ else 'svdup_n_{suf}(({typ})0)'.format(**fmtspec), **fmtspec) if typ == 'f64' and simd_ext == 'neon128': return '''nsimd_neon128_vf64 ret; if ({in0}.v0) {{ ret.v0 = {in1}[0]; }} else {{ ret.v0 = {oz0}; }} if ({in0}.v1) {{ ret.v1 = {in1}[1]; }} else {{ ret.v1 = {oz1}; }} return ret;'''.format( oz0 = '0.0f' if oz == 'z' else '{in2}.v0'.format(**fmtspec), oz1 = '0.0f' if oz == 'z' else '{in2}.v1'.format(**fmtspec), **fmtspec) le = 128 // int(typ[1:]) normal = '''int i; {typ} buf[{le}]; u{typnbits} mask[{le}]; vst1q_{suf}(buf, {oz}); vst1q_u{typnbits}(mask, {in0}); for (i = 0; i < {le}; i++) {{ if (mask[i]) {{ buf[i] = {in1}[i]; }} }} return vld1q_{suf}(buf);'''. 
\ format(oz='vdupq_n_{suf}(({typ})0)'.format(**fmtspec) \ if oz == 'z' else '{in2}'.format(**fmtspec), le=le, **fmtspec) if typ == 'f16': return '''#ifdef NSIMD_ARM_FP16 {normal} #else int i; nsimd_{simd_ext}_vf16 ret; f32 buf[8]; u32 mask[8]; vst1q_f32(buf, {oz0}); vst1q_f32(buf + 4, {oz1}); vst1q_u32(mask, {in0}.v0); vst1q_u32(mask + 4, {in0}.v1); for (i = 0; i < 8; i++) {{ if (mask[i]) {{ buf[i] = nsimd_f16_to_f32({in1}[i]); }} }} ret.v0 = vld1q_f32(buf); ret.v1 = vld1q_f32(buf + 4); return ret; #endif'''. \ format(oz0='vdupq_n_f32(0.0f)'.format(**fmtspec) \ if oz == 'z' else '{in2}.v0'.format(**fmtspec), oz1='vdupq_n_f32(0.0f)'.format(**fmtspec) \ if oz == 'z' else '{in2}.v1'.format(**fmtspec), normal=normal, **fmtspec) return normal # ----------------------------------------------------------------------------- # Stores of degree 1, 2, 3 and 4 def store1234(opts, simd_ext, typ, deg): if simd_ext in neon: if deg == 1: normal = 'vst{deg}q_{suf}({in0}, {in1});'. \ format(deg=deg, **fmtspec) if typ == 'f16': return \ '''#ifdef NSIMD_ARM_FP16 {normal} #else f32 buf[4]; vst1q_f32(buf, {in1}.v0); *((u16*){in0} ) = nsimd_f32_to_u16(buf[0]); *((u16*){in0} + 1) = nsimd_f32_to_u16(buf[1]); *((u16*){in0} + 2) = nsimd_f32_to_u16(buf[2]); *((u16*){in0} + 3) = nsimd_f32_to_u16(buf[3]); vst1q_f32(buf, {in1}.v1); *((u16*){in0} + 4) = nsimd_f32_to_u16(buf[0]); *((u16*){in0} + 5) = nsimd_f32_to_u16(buf[1]); *((u16*){in0} + 6) = nsimd_f32_to_u16(buf[2]); *((u16*){in0} + 7) = nsimd_f32_to_u16(buf[3]); #endif'''.format(normal=normal, **fmtspec) elif typ == 'f64' and simd_ext == 'neon128': return \ '''*{in0} = {in1}.v0; *({in0} + 1) = {in1}.v1;'''.format(**fmtspec) else: return normal else: normal = \ '''{soa_typ} buf; {assignment} vst{deg}q_{suf}({in0}, buf);'''. \ format(deg=deg, assignment='\n'.join([ 'buf.val[{{}}] = {{in{}}};'.format(i). 
\ format(i - 1, **fmtspec) for i in range(1, deg + 1)]), soa_typ=get_native_soa_typ(simd_ext, typ, deg), **fmtspec) if typ == 'f16': assignment = \ '''nsimd_storeu_{{simd_ext}}_f16((f16 *)buf, {{in{}}}); temp.val[{{}}] = vld1q_u16(buf);''' return \ '''#ifdef NSIMD_ARM_FP16 {normal} #else {soa_typ} temp; u16 buf[8]; {assignment} vst{deg}q_u16((u16 *){in0}, temp); #endif'''. \ format(assignment='\n'.join([assignment.format(i). \ format(i - 1, **fmtspec) \ for i in range(1, deg + 1)]), deg=deg, normal=normal, soa_typ=get_native_soa_typ(simd_ext, 'u16', deg), **fmtspec) elif typ == 'f64' and simd_ext == 'neon128': return \ '\n'.join(['*({{in0}} + {}) = {{in{}}}.v0;'. \ format(i - 1, i).format(**fmtspec) \ for i in range(1, deg + 1)]) + '\n' + \ '\n'.join(['*({{in0}} + {}) = {{in{}}}.v1;'. \ format(i + deg - 1, i).format(**fmtspec) \ for i in range(1, deg + 1)]) elif typ in ['i64', 'u64'] and simd_ext == 'neon128': return \ '{typ} buf[{biglen}];'.format(biglen=2 * deg, **fmtspec) + \ '\n'.join(['vst1q_{{suf}}(buf + {im1x2}, {{in{i}}});'. \ format(im1x2=2 * (i - 1), i=i).format(**fmtspec) \ for i in range(1, deg + 1)]) + \ '\n'.join(['''*({in0} + {i}) = buf[{ix2}]; *({in0} + {ipd}) = buf[{ix2p1}];'''. \ format(i=i, ipd=i + deg, ix2=i * 2, ix2p1=2 * i + 1, **fmtspec) \ for i in range(0, deg)]) else: return normal else: if deg == 1: return 'svst{deg}_{suf}({svtrue}, {in0}, {in1});'. \ format(deg=deg, **fmtspec) fill_soa_typ = \ '\n'.join(['tmp = svset{{deg}}_{{suf}}(tmp, {im1}, {{in{i}}});'. \ format(im1=i - 1, i=i).format(deg=deg, **fmtspec) \ for i in range(1, deg + 1)]) return \ '''{soa_typ} tmp = svundef{deg}_{suf}(); {fill_soa_typ} svst{deg}_{suf}({svtrue}, {in0}, tmp);'''. 
\ format(soa_typ=get_native_soa_typ('sve', typ, deg), deg=deg, fill_soa_typ=fill_soa_typ, **fmtspec) # ----------------------------------------------------------------------------- # Mask stores def mask_store(simd_ext, typ): if simd_ext in sve: return 'svst1_{suf}({in0}, {in1}, {in2});'.format(**fmtspec) if typ == 'f64' and simd_ext == 'neon128': return '''if ({in0}.v0) {{ {in1}[0] = {in2}.v0; }} if ({in0}.v1) {{ {in1}[1] = {in2}.v1; }}'''.format(**fmtspec) le = 128 // int(typ[1:]) normal = '''int i; {typ} buf[{le}]; u{typnbits} mask[{le}]; vst1q_{suf}(buf, {in2}); vst1q_u{typnbits}(mask, {in0}); for (i = 0; i < {le}; i++) {{ if (mask[i]) {{ {in1}[i] = buf[i]; }} }}'''.format(le=le, **fmtspec) if typ == 'f16': return \ '''#ifdef NSIMD_ARM_FP16 {normal} #else f32 buf[8]; u32 mask[8]; int i; vst1q_u32(mask, {in0}.v0); vst1q_u32(mask + 4, {in0}.v1); vst1q_f32(buf, {in2}.v0); vst1q_f32(buf + 4, {in2}.v1); for (i = 0; i < 8; i++) {{ if (mask[i]) {{ {in1}[i] = nsimd_f32_to_f16(buf[i]); }} }} #endif'''.format(normal=normal, **fmtspec) return normal # ----------------------------------------------------------------------------- # Length def len1(simd_ext, typ): if simd_ext in neon: return 'return {};'.format(128 // int(typ[1:])) elif simd_ext == 'sve': return 'return (int)svcntp_b{typnbits}({svtrue}, {svtrue});'. \ format(**fmtspec) elif simd_ext in fixed_sized_sve: return 'return {};'.format(int(simd_ext[3:]) // int(typ[1:])) # ----------------------------------------------------------------------------- # Add/sub def addsub(op, simd_ext, typ): ret = f16f64(simd_ext, typ, op, op, 2) if ret != '': return ret if simd_ext in neon: return 'return v{op}q_{suf}({in0}, {in1});'. \ format(op=op, **fmtspec) else: return 'return sv{op}_{suf}_x({svtrue}, {in0}, {in1});'. 
\ format(op=op, **fmtspec) # ----------------------------------------------------------------------------- # Multiplication def mul2(simd_ext, typ): ret = f16f64(simd_ext, typ, 'mul', 'mul', 2) if ret != '': return ret elif simd_ext in neon and typ in ['i64', 'u64']: return emulate_op2('*', simd_ext, typ) else: if simd_ext in neon: return 'return vmulq_{suf}({in0}, {in1});'.format(**fmtspec) else: return 'return svmul_{suf}_x({svtrue}, {in0}, {in1});'. \ format(**fmtspec) # ----------------------------------------------------------------------------- # Division def div2(simd_ext, typ): if simd_ext == 'aarch64' and typ in ['f32', 'f64']: return 'return vdivq_{suf}({in0}, {in1});'.format(**fmtspec) elif simd_ext in sve and \ typ in ['f16', 'f32', 'f64', 'i32', 'u32', 'i64', 'u64']: return 'return svdiv_{suf}_x({svtrue}, {in0}, {in1});'. \ format(**fmtspec) else: ret = f16f64(simd_ext, typ, 'div', 'div', 2) if ret != '': return ret return emulate_op2('/', simd_ext, typ) # ----------------------------------------------------------------------------- # Binary operators: and, or, xor, andnot def binop2(op, simd_ext, typ): armop = {'orb': 'orr', 'xorb': 'eor', 'andb': 'and', 'andnotb': 'bic'} if typ in common.iutypes: if simd_ext in neon: return 'return v{armop}q_{suf}({in0}, {in1});'. \ format(armop=armop[op], **fmtspec) else: return 'return sv{armop}_{suf}_x({svtrue}, {in0}, {in1});'. \ format(armop=armop[op], **fmtspec) # From here only float types if typ == 'f16': intrinsics = \ '''return vreinterpretq_f16_u16(v{armop}q_u16(vreinterpretq_u16_f16( {in0}), vreinterpretq_u16_f16({in1})));'''. \ format(armop=armop[op], **fmtspec) else: intrinsics = '' ret = f16f64(simd_ext, typ, op, armop[op], 2, intrinsics) if ret != '': return ret if simd_ext in neon: return \ '''return vreinterpretq_f{typnbits}_u{typnbits}(v{armop}q_u{typnbits}( vreinterpretq_u{typnbits}_f{typnbits}({in0}), vreinterpretq_u{typnbits}_f{typnbits}({in1})));'''. 
\ format(armop=armop[op], **fmtspec) else: return \ '''return svreinterpret_f{typnbits}_u{typnbits}( sv{armop}_u{typnbits}_x({svtrue}, svreinterpret_u{typnbits}_f{typnbits}({in0}), svreinterpret_u{typnbits}_f{typnbits}({in1})));'''. \ format(armop=armop[op], **fmtspec) # ----------------------------------------------------------------------------- # Binary not def not1(simd_ext, typ): if typ in common.iutypes: if simd_ext in neon: if typ in ['i8', 'u8', 'i16', 'u16', 'i32', 'u32']: return 'return vmvnq_{suf}({in0});'.format(**fmtspec) else: return \ '''return vreinterpretq_{suf}_u32(vmvnq_u32( vreinterpretq_u32_{suf}({in0})));'''. \ format(**fmtspec) if simd_ext in sve: return 'return svnot_{suf}_x({svtrue}, {in0});'.format(**fmtspec) # From here only float types if typ == 'f16': intrinsics = \ '''return vreinterpretq_f16_u16(vmvnq_u16(vreinterpretq_u16_f16( {in0})));'''.format(**fmtspec) else: intrinsics = '' ret = f16f64(simd_ext, typ, 'notb', 'mvn', 1, intrinsics) if ret != '': return ret if simd_ext in neon: return \ '''return vreinterpretq_{suf}_u32(vmvnq_u32( vreinterpretq_u32_{suf}({in0})));'''. \ format(**fmtspec) else: return \ '''return svreinterpret_{suf}_u{typnbits}(svnot_u{typnbits}_x( {svtrue}, svreinterpret_u{typnbits}_{suf}({in0})));'''. 
\ format(**fmtspec)

# -----------------------------------------------------------------------------
# Logical operators: and, or, xor, andnot

# Emit C source for the logical (mask) binary operators. On NEON the masks are
# plain unsigned vectors so the bitwise intrinsics (orr/eor/and/bic) apply; the
# f16 case needs an #ifdef because without native FP16 the mask is a pair of
# u32 vectors. On SVE, predicate intrinsics are used unless
# opts.sve_emulate_bool forces the unsigned-vector workaround.
def lop2(opts, op, simd_ext, typ):
    armop = {'orl': 'orr', 'xorl': 'eor', 'andl': 'and', 'andnotl': 'bic'}
    if simd_ext in neon:
        if typ == 'f16':
            return \
            '''#ifdef NSIMD_ARM_FP16
                 return v{armop}q_u16({in0}, {in1});
               #else
                 nsimd_{simd_ext}_vlf16 ret;
                 ret.v0 = v{armop}q_u32({in0}.v0, {in1}.v0);
                 ret.v1 = v{armop}q_u32({in0}.v1, {in1}.v1);
                 return ret;
               #endif'''.format(armop=armop[op], **fmtspec)
        elif simd_ext == 'neon128' and typ == 'f64':
            # neon128 has no f64 vector: the mask is a struct of two u64
            # scalars, so operate with plain C bitwise operators.
            if op == 'andnotl':
                return '''nsimd_{simd_ext}_vlf64 ret;
                          ret.v0 = {in0}.v0 & (~{in1}.v0);
                          ret.v1 = {in0}.v1 & (~{in1}.v1);
                          return ret;'''.format(**fmtspec)
            else:
                cpuop = {'orl': '|', 'xorl': '^', 'andl': '&'}
                return '''nsimd_{simd_ext}_vlf64 ret;
                          ret.v0 = {in0}.v0 {cpuop} {in1}.v0;
                          ret.v1 = {in0}.v1 {cpuop} {in1}.v1;
                          return ret;'''.format(cpuop=cpuop[op], **fmtspec)
        else:
            return 'return v{armop}q_u{typnbits}({in0}, {in1});'. \
                   format(armop=armop[op], **fmtspec)
    else:
        if opts.sve_emulate_bool:
            # TODO: the casts are a workaround to avoid a bug in gcc trunk for sve
            # it needs to be deleted when the bug is corrected
            return \
            '''return sv{armop}_x({svtrue}, (svuint{typnbits}_t){in0},
                                  (svuint{typnbits}_t){in1});'''. \
                                  format(armop=armop[op], **fmtspec)
        else:
            return '''return sv{armop}_z({svtrue}, {in0}, {in1});'''. \
                   format(armop=armop[op], **fmtspec)

# -----------------------------------------------------------------------------
# Logical not

# Emit C source for logical (mask) negation. NEON uses vmvnq on the unsigned
# mask vector; 64-bit lanes have no vmvnq so the value is viewed as u32,
# negated and viewed back. SVE uses svnot on the predicate (or the unsigned
# vector when bools are emulated).
def lnot1(opts, simd_ext, typ):
    if simd_ext in neon:
        if typ == 'f16':
            return \
            '''#ifdef NSIMD_ARM_FP16
                 return vmvnq_u16({in0});
               #else
                 nsimd_{simd_ext}_vlf16 ret;
                 ret.v0 = vmvnq_u32({in0}.v0);
                 ret.v1 = vmvnq_u32({in0}.v1);
                 return ret;
               #endif'''.format(**fmtspec)
        elif simd_ext == 'neon128' and typ == 'f64':
            return '''nsimd_neon128_vlf64 ret;
                      ret.v0 = ~{in0}.v0;
                      ret.v1 = ~{in0}.v1;
                      return ret;'''.format(**fmtspec)
        elif typ in ['i64', 'u64', 'f64']:
            # no vmvnq for 64-bit lanes: bounce through u32
            return '''return vreinterpretq_u{typnbits}_u32(vmvnq_u32(
                               vreinterpretq_u32_u{typnbits}({in0})));'''. \
                               format(**fmtspec)
        else:
            return 'return vmvnq_u{typnbits}({in0});'.format(**fmtspec)
    elif simd_ext in sve:
        if opts.sve_emulate_bool:
            # TODO: the cast is a workaround to avoid a bug in gcc trunk for sve
            # it needs to be deleted when the bug is corrected
            return 'return svnot_x({svtrue}, (svuint{typnbits}_t){in0});'.format(**fmtspec)
        else:
            return 'return svnot_z({svtrue}, {in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------
# Square root

# Emit C source for sqrt. neon128 has no vsqrtq: f16/f64 go through the
# f16f64 emulation helper and other types through scalar emulation; aarch64
# has native vsqrtq; SVE has svsqrt.
def sqrt1(simd_ext, typ):
    if simd_ext == 'neon128':
        # NOTE(review): `typ in 'f16'` is a substring test, not an equality
        # test ('f' and '1' and '16' also match) — presumably `typ == 'f16'`
        # was intended; it works only because no other typ value is a
        # substring of 'f16'. TODO confirm and tighten.
        if typ in 'f16':
            return '''nsimd_neon128_vf16 ret;
                      ret.v0 = nsimd_sqrt_neon128_f32({in0}.v0);
                      ret.v1 = nsimd_sqrt_neon128_f32({in0}.v1);
                      return ret;'''.format(**fmtspec)
        elif typ == 'f64':
            return f16f64('neon128', 'f64', 'sqrt', 'sqrt', 1)
        else:
            return emulate_op1('sqrt', simd_ext, typ)
    elif simd_ext == 'aarch64':
        if typ == 'f16':
            return f16f64('aarch64', 'f16', 'sqrt', 'sqrt', 1)
        else:
            return 'return vsqrtq_{suf}({in0});'.format(**fmtspec)
    else:
        return 'return svsqrt_{suf}_x({svtrue}, {in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------
# Shifts

# Emit C source for shl/shr by an immediate. NEON only has vshlq (shift by a
# signed vector), so right shifts pass a negated count; signed inputs are
# viewed as unsigned so that shr is logical. SVE uses lsl/lsr, with signed
# types reinterpreted to their bitfield (unsigned) type for lsr.
def shl_shr(op, simd_ext, typ):
    if simd_ext in neon:
        sign = '-' if op == 'shr' else ''
        if typ in common.utypes:
            return '''return vshlq_{suf}({in0}, vdupq_n_s{typnbits}(
                               (i{typnbits})({sign}{in1})));'''. \
                               format(sign=sign, **fmtspec)
        else:
            return \
            '''return vreinterpretq_s{typnbits}_u{typnbits}(vshlq_u{typnbits}(
                        vreinterpretq_u{typnbits}_s{typnbits}({in0}),
                        vdupq_n_s{typnbits}((i{typnbits})({sign}{in1}))));'''. \
                        format(sign=sign, **fmtspec)
    else:
        armop = 'lsl' if op == 'shl' else 'lsr'
        if op == 'shr' and typ in common.itypes:
            return \
            '''return svreinterpret_{suf}_{suf2}(sv{armop}_{suf2}_x({svtrue},
                        svreinterpret_{suf2}_{suf}({in0}),
                        svdup_n_u{typnbits}((u{typnbits}){in1})));'''. \
                        format(suf2=common.bitfield_type[typ], armop=armop,
                               **fmtspec)
        else:
            return '''return sv{armop}_{suf}_x({svtrue}, {in0},
                               svdup_n_u{typnbits}((u{typnbits}){in1}));'''. \
                               format(armop=armop, **fmtspec)

# Emit C source for arithmetic right shift. Unsigned types delegate to the
# logical shift; signed NEON uses vshlq with a negative count (which is an
# arithmetic shift on signed vectors); SVE uses svasr.
def shra(simd_ext, typ):
    if typ in common.utypes:
        return '''return nsimd_shr_{simd_ext}_{typ}({in0}, {in1});'''. \
               format(**fmtspec)
    if simd_ext in neon:
        return '''return vshlq_{suf}(
                    {in0}, vdupq_n_s{typnbits}((i{typnbits})-{in1}));'''.\
                    format(**fmtspec)
    elif simd_ext in sve:
        if typ[0] == 'i':
            return '''return svasr_n_{suf}_x({svtrue}, {in0},
                               (u{typnbits}){in1});'''.\
                               format(**fmtspec)
        elif typ[0] == 'u':
            # unreachable in practice: utypes already returned above
            return 'return svlsl_n_{suf}_x({svtrue}, {in0}, (u64){in1});'.\
                   format(**fmtspec)

# -----------------------------------------------------------------------------
# Set1

# Emit C source for broadcasting a scalar into all lanes (vdupq/svdup); the
# non-FP16 f16 fallback converts to f32 and fills both halves.
def set1(simd_ext, typ):
    if simd_ext in neon:
        if typ == 'f16':
            return '''#ifdef NSIMD_ARM_FP16
                        return vdupq_n_f16({in0});
                      #else
                        nsimd_{simd_ext}_vf16 ret;
                        f32 f = nsimd_f16_to_f32({in0});
                        ret.v0 = nsimd_set1_{simd_ext}_f32(f);
                        ret.v1 = nsimd_set1_{simd_ext}_f32(f);
                        return ret;
                      #endif'''.format(**fmtspec)
        elif simd_ext == 'neon128' and typ == 'f64':
            return '''nsimd_neon128_vf64 ret;
                      ret.v0 = {in0};
                      ret.v1 = {in0};
                      return ret;'''.format(**fmtspec)
        else:
            return 'return vdupq_n_{suf}({in0});'.format(**fmtspec)
    else:
        return 'return svdup_n_{suf}({in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------
# Set1l

# Emit C source for broadcasting a boolean into a logical (mask) vector:
# all-ones or all-zeros lanes on NEON, ptrue/pfalse predicates on SVE.
def lset1(simd_ext, typ):
    if simd_ext in sve:
        return '''if ({in0}) {{
                    return svptrue_b{typnbits}();
                  }} else {{
                    return svpfalse_b();
                  }}'''.format(**fmtspec)
    # getting here means no NEON and AARCH64 only
    mask = 'vdupq_n_u{typnbits}((u{typnbits}){{}})'.format(**fmtspec)
    normal = '''if ({in0}) {{
                  return {ones};
                }} else {{
                  return {zeros};
                }}'''.format(ones=mask.format('-1'), zeros=mask.format('0'),
                             **fmtspec)
    if typ == 'f16':
        return '''#ifdef NSIMD_ARM_FP16
                    {normal}
                  #else
                    nsimd_{simd_ext}_vlf16 ret;
                    ret.v0 = nsimd_set1l_{simd_ext}_f32({in0});
                    ret.v1 = ret.v0;
                    return ret;
                  #endif'''.format(normal=normal, **fmtspec)
    if typ == 'f64' and simd_ext == 'neon128':
        return '''nsimd_neon128_vlf64 ret;
                  ret.v0 = (u64)({in0} ? -1 : 0);
                  ret.v1 = ret.v0;
                  return ret;'''.format(**fmtspec)
    return normal

# -----------------------------------------------------------------------------
# Comparison operators: ==, <, <=, >, >=

# Emit C source for the ordered comparisons. NEON uses vceqq/vcltq/…; 64-bit
# neon128 lanes are compared through memory with C operators. SVE compares
# into a predicate (converted to a vector when bools are emulated).
def cmp2(opts, op, simd_ext, typ):
    binop = {'eq': '==', 'lt': '<', 'le': '<=', 'gt': '>', 'ge': '>='}
    armop = {'eq': 'eq', 'lt': 'lt', 'le': 'le', 'gt': 'gt', 'ge': 'ge'}
    if simd_ext in neon:
        emul_f16 = '''nsimd_{simd_ext}_vlf16 ret;
                      ret.v0 = nsimd_{op}_{simd_ext}_f32({in0}.v0, {in1}.v0);
                      ret.v1 = nsimd_{op}_{simd_ext}_f32({in0}.v1, {in1}.v1);
                      return ret;'''.format(op=op, **fmtspec)
        normal = 'return vc{armop}q_{suf}({in0}, {in1});'. \
                 format(armop=armop[op], **fmtspec)
        if typ == 'f16':
            if simd_ext == 'neon128':
                return emul_f16
            else:
                return \
                '''#ifdef NSIMD_ARM_FP16
                     {}
                   #else
                     {}
                   #endif'''.format(normal, emul_f16)
        if simd_ext == 'neon128' and typ == 'f64':
            return '''nsimd_{simd_ext}_vl{typ} ret;
                      ret.v0 = {in0}.v0 {op} {in1}.v0 ? (u64)-1 : 0;
                      ret.v1 = {in0}.v1 {op} {in1}.v1 ? (u64)-1 : 0;
                      return ret;'''.format(op=binop[op], **fmtspec)
        elif simd_ext == 'neon128' and typ in ['i64', 'u64']:
            # no 64-bit compare intrinsics on neon128: go through memory
            return '''{typ} buf0[2], buf1[2];
                      u64 ret[2];
                      vst1q_{suf}(buf0, {in0});
                      vst1q_{suf}(buf1, {in1});
                      ret[0] = buf0[0] {op} buf1[0] ? (u64)-1 : 0;
                      ret[1] = buf0[1] {op} buf1[1] ? (u64)-1 : 0;
                      return vld1q_u64(ret);'''. \
                      format(op=binop[op], **fmtspec)
        else:
            return normal
    elif simd_ext in sve:
        if opts.sve_emulate_bool:
            # TODO: the casts are a workaround to avoid a bug in gcc trunk for sve
            # it needs to be deleted when the bug is corrected
            comp = 'svcmp{op}_{suf}({svtrue}, ({svetyp}){in0}, ({svetyp}){in1})'. \
                   format(op=armop[op], **fmtspec)
            return 'return {};'.format(convert_from_predicate(opts, comp))
        else:
            return 'return svcmp{op}_{suf}({svtrue}, {in0}, {in1});'. \
                   format(op=armop[op], **fmtspec)

# -----------------------------------------------------------------------------
# Not equal

# Emit C source for !=: NEON negates the eq mask; SVE uses svcmpne.
def neq2(opts, simd_ext, typ):
    if simd_ext in neon:
        return '''return nsimd_notl_{simd_ext}_{typ}(
                    nsimd_eq_{simd_ext}_{typ}({in0}, {in1}));'''. \
                    format(**fmtspec)
    elif simd_ext in sve:
        comp = 'svcmpne_{suf}({svtrue}, {in0}, {in1})'. \
               format(**fmtspec)
        return 'return {};'.format(convert_from_predicate(opts, comp))

# -----------------------------------------------------------------------------
# If_else

# Emit C source for lane-wise select: vbslq on NEON, svsel on SVE. The
# neon128 f64 case selects per scalar half on the u64 mask.
def if_else3(opts, simd_ext, typ):
    if simd_ext in neon:
        intrinsic = 'return vbslq_{suf}({in0}, {in1}, {in2});'. \
                    format(**fmtspec)
        if typ == 'f16':
            return \
            '''#ifdef NSIMD_ARM_FP16
                 {intrinsic}
               #else
                 nsimd_{simd_ext}_vf16 ret;
                 ret.v0 = nsimd_if_else1_{simd_ext}_f32(
                            {in0}.v0, {in1}.v0, {in2}.v0);
                 ret.v1 = nsimd_if_else1_{simd_ext}_f32(
                            {in0}.v1, {in1}.v1, {in2}.v1);
                 return ret;
               #endif'''.format(intrinsic=intrinsic, **fmtspec)
        elif simd_ext == 'neon128' and typ == 'f64':
            return '''nsimd_neon128_vf64 ret;
                      ret.v0 = {in0}.v0 != 0u ? {in1}.v0 : {in2}.v0;
                      ret.v1 = {in0}.v1 != 0u ? {in1}.v1 : {in2}.v1;
                      return ret;'''.format(**fmtspec)
        else:
            return intrinsic
    elif simd_ext in sve:
        if opts.sve_emulate_bool:
            # TODO: the casts are a workaround to avoid a bug in gcc trunk for sve
            # it needs to be deleted when the bug is corrected
            return 'return svsel_{suf}({cond}, ({svetyp}){in1}, ({svetyp}){in2});' \
                   .format(cond=convert_to_predicate(opts,
                                                    '{in0}'.format(**fmtspec)),
                           **fmtspec)
        else:
            return 'return svsel_{suf}({in0}, {in1}, {in2});' \
                   .format(**fmtspec)

# -----------------------------------------------------------------------------
# Minimum and maximum

# Emit C source for min/max. f16/f64 special cases are handled by f16f64;
# 64-bit NEON lanes go through memory; otherwise vminq/vmaxq or svmin/svmax.
def minmax2(op, simd_ext, typ):
    ret = f16f64(simd_ext, typ, op, op, 2)
    if ret != '':
        return ret
    if simd_ext in neon:
        if typ in ['i64', 'u64']:
            binop = '<' if op == 'min' else '>'
            return '''{typ} buf0[2], buf1[2];
                      vst1q_{suf}(buf0, {in0});
                      vst1q_{suf}(buf1, {in1});
                      buf0[0] = buf0[0] {binop} buf1[0] ? buf0[0] : buf1[0];
                      buf0[1] = buf0[1] {binop} buf1[1] ? buf0[1] : buf1[1];
                      return vld1q_{suf}(buf0);'''. \
                      format(binop=binop, **fmtspec)
        else:
            return 'return v{op}q_{suf}({in0}, {in1});'. \
                   format(op=op, **fmtspec)
    else:
        return 'return sv{op}_{suf}_x({svtrue}, {in0}, {in1});'. \
               format(op=op, **fmtspec)

# -----------------------------------------------------------------------------
# Abs

# Emit C source for absolute value: identity for unsigned types, vabsq where
# NEON provides it, emulation for neon128 i64, svabs on SVE.
def abs1(simd_ext, typ):
    if typ in common.utypes:
        return 'return {in0};'.format(**fmtspec)
    elif simd_ext in neon:
        if typ == 'f16':
            return f16f64(simd_ext, 'f16', 'abs', 'abs', 1)
        elif (typ in ['i8', 'i16', 'i32', 'f32']) or \
             (simd_ext == 'aarch64' and typ in ['i64', 'f64']):
            return 'return vabsq_{suf}({in0});'.format(**fmtspec)
        elif typ == 'i64':
            return emulate_op1('abs', 'neon128', 'i64')
        else:
            return f16f64(simd_ext, 'f64', 'abs', 'abs', 1)
    else:
        return 'return svabs_{suf}_x({svtrue}, {in0});'. \
               format(**fmtspec)

# -----------------------------------------------------------------------------
# Round, trunc, ceil and round_to_even

# Emit C source for the rounding family: identity on integers, vrnd* on
# aarch64, svrint* on SVE, emulation on neon128.
def round1(op, simd_ext, typ):
    if typ in common.iutypes:
        return 'return {in0};'.format(**fmtspec)
    armop = {'floor': 'rndm', 'ceil': 'rndp', 'trunc': 'rnd',
             'round_to_even': 'rndn'}
    if simd_ext == 'neon128':
        # NOTE(review): `armop=armop` formats the whole dict into the
        # intrinsic name instead of `armop[op]` — this looks like a bug.
        # It is harmless only if f16f64 ignores this argument on the
        # neon128 path; TODO confirm (compare with the aarch64 branch
        # below, which correctly uses armop[op]).
        ret = f16f64('neon128', typ, op, 'v{armop}q_{suf}'. \
                     format(armop=armop, **fmtspec), 1)
        if ret != '':
            return ret
        return emulate_op1(op, 'neon128', typ);
    elif simd_ext == 'aarch64':
        if typ == 'f16':
            return f16f64('aarch64', 'f16', op, armop[op], 1)
        else:
            return 'return v{armop}q_{suf}({in0});'. \
                   format(armop=armop[op], **fmtspec)
    else:
        armop = {'floor': 'rintm', 'ceil': 'rintp', 'trunc': 'rintz',
                 'round_to_even': 'rintn'}
        return 'return sv{armop}_{suf}_x({svtrue}, {in0});'. \
               format(armop=armop[op], **fmtspec)

# -----------------------------------------------------------------------------
# FMA and FNMA

# Emit C source for fused multiply-add (a0*a1+a2) and its negated-product
# variant. Note the NEON/SVE accumulator-first argument order: the generated
# call is (a2, a1, a0).
def fmafnma3(op, simd_ext, typ):
    if typ in common.ftypes and simd_ext == 'aarch64':
        armop = {'fma': 'fma', 'fnma': 'fms'}
    else:
        armop = {'fma': 'mla', 'fnma': 'mls'}
    if simd_ext in neon:
        normal = 'return v{armop}q_{suf}({in2}, {in1}, {in0});'. \
                 format(armop=armop[op], **fmtspec)
        emul = emulate_op3_neon(op, simd_ext, typ)
        if typ == 'f16':
            using_f32 = \
            '''nsimd_{simd_ext}_vf16 ret;
               ret.v0 = nsimd_{op}_{simd_ext}_f32({in0}.v0, {in1}.v0, {in2}.v0);
               ret.v1 = nsimd_{op}_{simd_ext}_f32({in0}.v1, {in1}.v1, {in2}.v1);
               return ret;'''.format(op=op, **fmtspec)
            if simd_ext == 'aarch64':
                # NOTE(review): the NSIMD_ARM_FP16 branch uses the scalar
                # emulation (emul), not `normal` — presumably because
                # vfmaq_f16 availability differs from the FP16 macro;
                # TODO confirm this is intentional.
                return \
                '''#ifdef NSIMD_ARM_FP16
                     {}
                   #else
                     {}
                   #endif'''.format(emul, using_f32)
            else:
                return using_f32
        elif simd_ext == 'neon128' and typ == 'f64':
            return emulate_f64_neon('neon128', op, ['v'] * 4)
        elif simd_ext == 'aarch64' and typ == 'f64':
            return normal
        elif typ in ['i64', 'u64']:
            return emul
        else:
            return normal
    else:
        return 'return sv{armop}_{suf}_x({svtrue}, {in2}, {in1}, {in0});'. \
               format(armop=armop[op], **fmtspec)

# -----------------------------------------------------------------------------
# FMS and FNMS

# Emit C source for fms/fnms by composing neg with fma/fnma (integers), by
# negating the addend on NEON, or with svnmla/svnmls on SVE.
def fmsfnms3(op, simd_ext, typ):
    if typ in common.iutypes:
        return \
        '''return nsimd_neg_{simd_ext}_{typ}(nsimd_{op2}_{simd_ext}_{typ}(
                    {in0}, {in1}, {in2}));'''. \
                    format(op2='fma' if op == 'fnms' else 'fnma', **fmtspec)
    if simd_ext in neon:
        return \
        '''return nsimd_{op2}_{simd_ext}_{typ}({in0}, {in1},
                    nsimd_neg_{simd_ext}_{typ}({in2}));'''. \
                    format(op2='fma' if op == 'fms' else 'fnma', **fmtspec)
    else:
        armop = {'fnms': 'nmla', 'fms': 'nmls'}
        return 'return sv{armop}_{suf}_x({svtrue}, {in2}, {in1}, {in0});'. \
               format(armop=armop[op], **fmtspec)

# -----------------------------------------------------------------------------
# Neg

# Emit C source for negation. Unsigned types are negated through the signed
# view (2's complement); neon128 lacks 64-bit vnegq so those are emulated.
def neg1(simd_ext, typ):
    if simd_ext in neon:
        normal = 'return vnegq_{suf}({in0});'.format(**fmtspec)
        if typ == 'f16':
            return f16f64(simd_ext, 'f16', 'neg', 'neg', 1)
        elif typ in ['i8', 'i16', 'i32', 'f32']:
            return normal
        elif typ in ['u8', 'u16', 'u32']:
            return \
            '''return vreinterpretq_{suf}_s{typnbits}(
                        vnegq_s{typnbits}(
                          vreinterpretq_s{typnbits}_{suf}({in0})));'''. \
                          format(**fmtspec)
        elif simd_ext == 'neon128' and typ in ['i64', 'u64']:
            return emulate_op1('neg', simd_ext, typ)
        elif simd_ext == 'neon128' and typ == 'f64':
            return \
            '''nsimd_neon128_vf64 ret;
               ret.v0 = -{in0}.v0;
               ret.v1 = -{in0}.v1;
               return ret;'''.format(**fmtspec)
        elif simd_ext == 'aarch64' and typ in ['f64', 'i64']:
            return normal
        elif simd_ext == 'aarch64' and typ == 'u64':
            return \
            '''return vreinterpretq_u64_s64(vnegq_s64(
                        vreinterpretq_s64_u64({in0})));'''. \
                        format(**fmtspec)
    else:
        if typ in common.utypes:
            return \
            '''return svreinterpret_{suf}_s{typnbits}(
                        svneg_s{typnbits}_x({svtrue},
                          svreinterpret_s{typnbits}_{suf}({in0})));'''. \
                          format(**fmtspec)
        else:
            return 'return svneg_{suf}_x({svtrue}, {in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------
# Reciprocals

# Emit C source for the reciprocal family. rec/rec11/rsqrt11 are generated
# as full-precision divisions (see the comment block below for why); only
# rec8/rsqrt8 use the hardware estimate intrinsics (vrecpeq/vrsqrteq,
# svrecpe/svrsqrte).
def recs1(op, simd_ext, typ):
    cte = '({typ})1'.format(**fmtspec) if typ != 'f16' \
          else 'nsimd_f32_to_f16(1.0f)'
    if op in ['rec', 'rec11']:
        return \
        '''return nsimd_div_{simd_ext}_{typ}(
                    nsimd_set1_{simd_ext}_{typ}({cte}), {in0});'''. \
                    format(cte=cte, **fmtspec)
    elif op == 'rsqrt11':
        return \
        '''return nsimd_div_{simd_ext}_{typ}(
                    nsimd_set1_{simd_ext}_{typ}({cte}),
                    nsimd_sqrt_{simd_ext}_{typ}({in0}));'''. \
                    format(cte=cte, **fmtspec)
    elif op in ['rec8', 'rsqrt8']:
        armop = 'recpe' if op == 'rec8' else 'rsqrte'
        if simd_ext in sve:
            return 'return sv{armop}_{suf}({in0});'. \
                   format(armop=armop, **fmtspec)
        else:
            ret = f16f64(simd_ext, typ, op, armop, 1)
            if ret != '':
                return ret
            return 'return v{armop}q_{suf}({in0});'. \
                   format(armop=armop, **fmtspec)

# Rec11 and rsqrt11
# According to http://infocenter.arm.com/help/topic/com.arm.doc.faqs/ka14282.html
# reciprocal estimates only work when inputs is restrained in some small
# interval so we comment these for now and return full-precision reciprocals.

# def rec11rsqrt11(op, simd_ext, typ):
#     armop = {'rec11': 'recpe', 'rsqrt11': 'rsqrte'}
#     if simd_ext in neon:
#         ret = f16f64(simd_ext, typ, op, armop[op], 1)
#         if ret != '':
#             return ret
#         return 'return v{armop}q_{suf}({in0});'. \
#                format(armop=armop[op], **fmtspec)
#     else:
#         return 'return sv{armop}_{suf}({in0});'. \
#                format(armop=armop[op], **fmtspec)

# -----------------------------------------------------------------------------
# Load of logicals

# Emit C source for loading a logical vector: load values then compare
# against zero ("!= 0" realized as not(eq)).
def loadl(aligned, simd_ext, typ):
    return \
    '''/* This can surely be improved but it is not our priority. */
       return nsimd_notl_{simd_ext}_{typ}(nsimd_eq_{simd_ext}_{typ}(
                nsimd_load{align}_{simd_ext}_{typ}(
                  {in0}), nsimd_set1_{simd_ext}_{typ}({zero})));'''. \
       format(align='a' if aligned else 'u',
              zero='nsimd_f32_to_f16(0.0f)' if typ == 'f16'
              else '({})0'.format(typ), **fmtspec)

# -----------------------------------------------------------------------------
# Store of logicals

# Emit C source for storing a logical vector: select 1 or 0 per lane, then a
# normal store.
def storel(aligned, simd_ext, typ):
    return \
    '''/* This can surely be improved but it is not our priority. */
       nsimd_store{align}_{simd_ext}_{typ}({in0},
         nsimd_if_else1_{simd_ext}_{typ}({in1},
           nsimd_set1_{simd_ext}_{typ}({one}),
           nsimd_set1_{simd_ext}_{typ}({zero})));'''. \
       format(align='a' if aligned else 'u',
              one='nsimd_f32_to_f16(1.0f)' if typ == 'f16'
              else '({})1'.format(typ),
              zero='nsimd_f32_to_f16(0.0f)' if typ == 'f16'
              else '({})0'.format(typ), **fmtspec)

# -----------------------------------------------------------------------------
# All and any

# Emit C source for the all/any reductions on logical vectors: lane-by-lane
# on neon128, vminvq/vmaxvq on aarch64, svptest on SVE.
def allany1(opts, op, simd_ext, typ):
    binop = '&&' if op == 'all' else '||'
    if simd_ext == 'neon128':
        if typ == 'f16':
            return \
            '''return nsimd_{op}_neon128_f32({in0}.v0) {binop}
                      nsimd_{op}_neon128_f32({in0}.v1);'''. \
                      format(op=op, binop=binop, **fmtspec)
        elif typ == 'f64':
            return 'return {in0}.v0 {binop} {in0}.v1;'. \
                   format(binop=binop, **fmtspec)
        else:
            return 'return ' + \
            binop.join(['vgetq_lane_u{typnbits}({in0}, {i})'. \
                        format(i=i, **fmtspec) \
                        for i in range(0, 128 // int(fmtspec['typnbits']))]) + \
            ';'
    elif simd_ext == 'aarch64':
        # min-reduce != 0 <=> all lanes set; max-reduce != 0 <=> any lane set
        armop = {'all': 'min', 'any': 'max'}
        normal = 'return v{armop}vq_u{typnbits}({in0}) != 0;'. \
                 format(armop=armop[op], **fmtspec)
        if typ == 'f16':
            return \
            '''#ifdef NSIMD_ARM_FP16
                 {normal}
               #else
                 return nsimd_{op}_aarch64_f32({in0}.v0) {binop}
                        nsimd_{op}_aarch64_f32({in0}.v1);
               #endif'''.format(normal=normal, op=op, binop=binop, **fmtspec)
        elif typ in ['i64', 'u64', 'f64']:
            return \
            'return v{armop}vq_u32(vreinterpretq_u32_u64({in0})) != 0;'. \
            format(armop=armop[op], **fmtspec)
        else:
            return normal
    elif simd_ext in sve:
        if op == 'any':
            operand = convert_to_predicate(opts, '{in0}'.format(**fmtspec))
            return '''return svptest_any({svtrue}, {operand});'''. \
                   format(operand=operand, **fmtspec)
        else:
            # all(x) <=> !any(!x)
            operand = 'svnot_z({svtrue}, {op})'. \
                      format(op=convert_to_predicate(opts,
                                                     '{in0}'.format(**fmtspec)),
                             **fmtspec)
            return '''return !svptest_any({svtrue}, {operand});'''. \
                   format(operand=operand, **fmtspec)

# -----------------------------------------------------------------------------
# nbtrue

# Emit C source counting the number of set lanes in a logical vector. NEON
# logical lanes are all-ones (i.e. -1 as signed), so summing the signed view
# and negating yields the count. SVE uses svcntp.
def nbtrue1(opts, simd_ext, typ):
    if simd_ext == 'neon128':
        if typ == 'f16':
            return \
            '''return nsimd_nbtrue_neon128_f32({in0}.v0) +
                      nsimd_nbtrue_neon128_f32({in0}.v1);'''. \
                      format(**fmtspec)
        elif typ == 'f64':
            return 'return -(int)((i64){in0}.v0 + (i64){in0}.v1);'. \
                   format(**fmtspec)
        else:
            return \
            '''nsimd_neon128_vi{typnbits} temp =
                   vreinterpretq_s{typnbits}_u{typnbits}({in0});
               return -(int)('''.format(**fmtspec) + \
            '+'.join(['vgetq_lane_s{typnbits}(temp, {i})'. \
                      format(i=i, **fmtspec) \
                      for i in range(0, 128 // int(fmtspec['typnbits']))]) + \
            ');'
    elif simd_ext == 'aarch64':
        normal = \
        '''return -(int)vaddvq_s{typnbits}(
                    vreinterpretq_s{typnbits}_u{typnbits}({in0}));'''. \
                    format(**fmtspec)
        if typ == 'f16':
            return \
            '''#ifdef NSIMD_ARM_FP16
                 {normal}
               #else
                 return nsimd_nbtrue_aarch64_f32({in0}.v0) +
                        nsimd_nbtrue_aarch64_f32({in0}.v1);
               #endif'''.format(normal=normal, **fmtspec)
        elif typ in ['i64', 'u64', 'f64']:
            # each 64-bit lane contributes two -1 u32 halves: halve the sum
            return \
            '''return -(vaddvq_s32(vreinterpretq_s32_u64({in0})) >> 1);'''. \
            format(**fmtspec)
        else:
            return normal
    elif simd_ext in sve:
        return 'return (int)svcntp_b{typnbits}({svtrue}, {op});'. \
               format(op=convert_to_predicate(opts,
                                              '{in0}'.format(**fmtspec)),
                      **fmtspec)

# -----------------------------------------------------------------------------
# Reinterpret logical

# Emit C source reinterpreting a logical vector between element types of the
# same width. Only the f16 <-> u16 views need per-lane expansion (the non-FP16
# f16 mask is a pair of u32 vectors); everything else is a no-op.
def reinterpretl1(simd_ext, from_typ, to_typ):
    if from_typ == to_typ or simd_ext in sve:
        return 'return {in0};'.format(**fmtspec)
    to_f16_with_f32 = \
    '''nsimd_{simd_ext}_vlf16 ret;
       u32 buf[4];
       buf[0] = (vgetq_lane_u16({in0}, 0) ? (u32)-1 : 0);
       buf[1] = (vgetq_lane_u16({in0}, 1) ? (u32)-1 : 0);
       buf[2] = (vgetq_lane_u16({in0}, 2) ? (u32)-1 : 0);
       buf[3] = (vgetq_lane_u16({in0}, 3) ? (u32)-1 : 0);
       ret.v0 = vld1q_u32(buf);
       buf[0] = (vgetq_lane_u16({in0}, 4) ? (u32)-1 : 0);
       buf[1] = (vgetq_lane_u16({in0}, 5) ? (u32)-1 : 0);
       buf[2] = (vgetq_lane_u16({in0}, 6) ? (u32)-1 : 0);
       buf[3] = (vgetq_lane_u16({in0}, 7) ? (u32)-1 : 0);
       ret.v1 = vld1q_u32(buf);
       return ret;'''.format(**fmtspec)
    from_f16_with_f32 = \
    '''u16 buf[8];
       buf[0] = (vgetq_lane_u32({in0}.v0, 0) ? (u16)-1 : 0);
       buf[1] = (vgetq_lane_u32({in0}.v0, 1) ? (u16)-1 : 0);
       buf[2] = (vgetq_lane_u32({in0}.v0, 2) ? (u16)-1 : 0);
       buf[3] = (vgetq_lane_u32({in0}.v0, 3) ? (u16)-1 : 0);
       buf[4] = (vgetq_lane_u32({in0}.v1, 0) ? (u16)-1 : 0);
       buf[5] = (vgetq_lane_u32({in0}.v1, 1) ? (u16)-1 : 0);
       buf[6] = (vgetq_lane_u32({in0}.v1, 2) ? (u16)-1 : 0);
       buf[7] = (vgetq_lane_u32({in0}.v1, 3) ? (u16)-1 : 0);
       return vld1q_u16(buf);'''.format(**fmtspec)
    if simd_ext == 'neon128':
        if to_typ == 'f16':
            return to_f16_with_f32
        elif from_typ == 'f16':
            return from_f16_with_f32
        elif to_typ == 'f64':
            return '''nsimd_neon128_vlf64 ret;
                      ret.v0 = vgetq_lane_u64({in0}, 0);
                      ret.v1 = vgetq_lane_u64({in0}, 1);
                      return ret;'''.format(**fmtspec)
        elif from_typ == 'f64':
            return '''u64 buf[2];
                      buf[0] = {in0}.v0;
                      buf[1] = {in0}.v1;
                      return vld1q_u64(buf);'''.format(**fmtspec)
        else:
            return 'return {in0};'.format(**fmtspec)
    elif simd_ext == 'aarch64':
        if to_typ == 'f16':
            return '''#ifdef NSIMD_ARM_FP16
                        return {in0};
                      #else
                        {using_f32}
                      #endif'''.format(using_f32=to_f16_with_f32, **fmtspec)
        elif from_typ == 'f16':
            return '''#ifdef NSIMD_ARM_FP16
                        return {in0};
                      #else
                        {using_f32}
                      #endif'''.format(using_f32=from_f16_with_f32, **fmtspec)
        else:
            return 'return {in0};'.format(**fmtspec)

# -----------------------------------------------------------------------------
# Convert

# Emit C source for value conversion between same-width types: reinterpret
# for int <-> int, vcvtq/svcvt for int <-> float, per-lane copies for the
# f16/f64 emulated representations.
def convert1(simd_ext, from_typ, to_typ):
    fmtspec2 = fmtspec.copy()
    fmtspec2['to_suf'] = suf(to_typ)
    fmtspec2['from_suf'] = suf(from_typ)
    if from_typ == to_typ:
        return 'return {in0};'.format(**fmtspec)
    if from_typ in common.iutypes and to_typ in common.iutypes:
        if simd_ext in neon:
            return 'return vreinterpretq_{to_suf}_{from_suf}({in0});'. \
                   format(**fmtspec2)
        else:
            return 'return svreinterpret_{to_suf}_{from_suf}({in0});'. \
                   format(**fmtspec2)
    if simd_ext in sve:
        return 'return svcvt_{to_suf}_{from_suf}_x({svtrue}, {in0});'. \
               format(**fmtspec2)
    to_f16_with_f32 = \
    '''nsimd_{simd_ext}_vf16 ret;
       f32 buf[4];
       buf[0] = (f32)vgetq_lane_{from_suf}({in0}, 0);
       buf[1] = (f32)vgetq_lane_{from_suf}({in0}, 1);
       buf[2] = (f32)vgetq_lane_{from_suf}({in0}, 2);
       buf[3] = (f32)vgetq_lane_{from_suf}({in0}, 3);
       ret.v0 = vld1q_f32(buf);
       buf[0] = (f32)vgetq_lane_{from_suf}({in0}, 4);
       buf[1] = (f32)vgetq_lane_{from_suf}({in0}, 5);
       buf[2] = (f32)vgetq_lane_{from_suf}({in0}, 6);
       buf[3] = (f32)vgetq_lane_{from_suf}({in0}, 7);
       ret.v1 = vld1q_f32(buf);
       return ret;'''.format(**fmtspec2)
    from_f16_with_f32 = \
    '''{to_typ} buf[8];
       buf[0] = ({to_typ})vgetq_lane_f32({in0}.v0, 0);
       buf[1] = ({to_typ})vgetq_lane_f32({in0}.v0, 1);
       buf[2] = ({to_typ})vgetq_lane_f32({in0}.v0, 2);
       buf[3] = ({to_typ})vgetq_lane_f32({in0}.v0, 3);
       buf[4] = ({to_typ})vgetq_lane_f32({in0}.v1, 0);
       buf[5] = ({to_typ})vgetq_lane_f32({in0}.v1, 1);
       buf[6] = ({to_typ})vgetq_lane_f32({in0}.v1, 2);
       buf[7] = ({to_typ})vgetq_lane_f32({in0}.v1, 3);
       return vld1q_{to_suf}(buf);'''.format(**fmtspec2)
    if simd_ext == 'neon128':
        if to_typ == 'f16':
            return to_f16_with_f32
        elif from_typ == 'f16':
            return from_f16_with_f32
        elif to_typ == 'f64':
            return '''nsimd_neon128_vf64 ret;
                      ret.v0 = (f64)vgetq_lane_{from_suf}({in0}, 0);
                      ret.v1 = (f64)vgetq_lane_{from_suf}({in0}, 1);
                      return ret;'''.format(**fmtspec2)
        elif from_typ == 'f64':
            return '''{to_typ} buf[2];
                      buf[0] = ({to_typ}){in0}.v0;
                      buf[1] = ({to_typ}){in0}.v1;
                      return vld1q_{to_suf}(buf);'''.format(**fmtspec2)
        else:
            return 'return vcvtq_{to_suf}_{from_suf}({in0});'. \
                   format(**fmtspec2)
    elif simd_ext == 'aarch64':
        if to_typ == 'f16':
            return '''#ifdef NSIMD_ARM_FP16
                        return vcvtq_{to_suf}_{from_suf}({in0});
                      #else
                        {using_f32}
                      #endif'''.format(using_f32=to_f16_with_f32, **fmtspec2)
        elif from_typ == 'f16':
            return '''#ifdef NSIMD_ARM_FP16
                        return vcvtq_{to_suf}_{from_suf}({in0});
                      #else
                        {using_f32}
                      #endif'''.format(using_f32=from_f16_with_f32, **fmtspec2)
        else:
            return 'return vcvtq_{to_suf}_{from_suf}({in0});'. \
                   format(**fmtspec2)

# -----------------------------------------------------------------------------
# Reinterpret

# Emit C source for bit-level reinterpretation between same-width types.
# The emulated f16/f64 representations go through per-lane u16 views or a
# scalar union; otherwise vreinterpretq/svreinterpret.
def reinterpret1(simd_ext, from_typ, to_typ):
    fmtspec2 = fmtspec.copy()
    fmtspec2['to_suf'] = suf(to_typ)
    fmtspec2['from_suf'] = suf(from_typ)
    if from_typ == to_typ:
        return 'return {in0};'.format(**fmtspec)
    if simd_ext in sve:
        return 'return svreinterpret_{to_suf}_{from_suf}({in0});'. \
               format(**fmtspec2)
    to_f16_with_f32 = \
    '''nsimd_{simd_ext}_vf16 ret;
       f32 buf[4];
       buf[0] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 0));
       buf[1] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 1));
       buf[2] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 2));
       buf[3] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 3));
       ret.v0 = vld1q_f32(buf);
       buf[0] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 4));
       buf[1] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 5));
       buf[2] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 6));
       buf[3] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 7));
       ret.v1 = vld1q_f32(buf);
       return ret;'''.format(**fmtspec2)
    from_f16_with_f32 = \
    '''{to_typ} buf[8];
       buf[0] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v0, 0));
       buf[1] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v0, 1));
       buf[2] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v0, 2));
       buf[3] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v0, 3));
       buf[4] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v1, 0));
       buf[5] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v1, 1));
       buf[6] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v1, 2));
       buf[7] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v1, 3));
       return vld1q_{to_suf}(buf);'''.format(**fmtspec2)
    if simd_ext == 'neon128':
        if to_typ == 'f16':
            return to_f16_with_f32
        elif from_typ == 'f16':
            return from_f16_with_f32
        elif to_typ == 'f64':
            return '''nsimd_neon128_vf64 ret;
                      union {{ f64 to; {from_typ} from; }} buf;
                      buf.from = vgetq_lane_{from_suf}({in0}, 0);
                      ret.v0 = buf.to;
                      buf.from = vgetq_lane_{from_suf}({in0}, 1);
                      ret.v1 = buf.to;
                      return ret;'''.format(**fmtspec2)
        elif from_typ == 'f64':
            return '''union {{ f64 from; {to_typ} to; }} buf_;
                      {to_typ} buf[2];
                      buf_.from = {in0}.v0;
                      buf[0] = buf_.to;
                      buf_.from = {in0}.v1;
                      buf[1] = buf_.to;
                      return vld1q_{to_suf}(buf);'''.format(**fmtspec2)
        else:
            return 'return vreinterpretq_{to_suf}_{from_suf}({in0});'. \
                   format(**fmtspec2)
    elif simd_ext == 'aarch64':
        if to_typ == 'f16':
            return '''#ifdef NSIMD_ARM_FP16
                        return vreinterpretq_{to_suf}_{from_suf}({in0});
                      #else
                        {using_f32}
                      #endif'''.format(using_f32=to_f16_with_f32, **fmtspec2)
        elif from_typ == 'f16':
            return '''#ifdef NSIMD_ARM_FP16
                        return vreinterpretq_{to_suf}_{from_suf}({in0});
                      #else
                        {using_f32}
                      #endif'''.format(using_f32=from_f16_with_f32, **fmtspec2)
        else:
            return 'return vreinterpretq_{to_suf}_{from_suf}({in0});'. \
                   format(**fmtspec2)

# -----------------------------------------------------------------------------
# reverse

# Emit C source reversing lane order: svrev on SVE; on NEON, vrev64q within
# 64-bit halves followed by swapping the halves with vcombine/vget.
def reverse1(simd_ext, typ):
    armtyp = suf(typ)
    if simd_ext in sve:
        return '''return svrev_{suf}( {in0} );'''.format(**fmtspec)
    elif simd_ext == 'neon128' and typ == 'f64':
        return '''nsimd_neon128_vf64 ret;
                  ret.v0 = {in0}.v1;
                  ret.v1 = {in0}.v0;
                  return ret;'''.format(**fmtspec)
    elif typ in ['i64', 'u64', 'f64']:
        return '''return vcombine_{armtyp}(vget_high_{armtyp}({in0}),
                                           vget_low_{armtyp}({in0}));'''. \
                                           format(armtyp=armtyp, **fmtspec)
    elif typ == 'f16':
        # NOTE(review): this string hard-codes `a0` instead of using the
        # {in0} placeholder like every other branch — it works only because
        # the first argument happens to be named a0. TODO confirm and use
        # {in0} for consistency.
        return '''nsimd_{simd_ext}_vf16 ret;
                  ret.v0 = nsimd_reverse_{simd_ext}_f32(a0.v1);
                  ret.v1 = nsimd_reverse_{simd_ext}_f32(a0.v0);
                  return ret;'''.format(**fmtspec)
    else:
        return '''{in0} = vrev64q_{armtyp}({in0});
                  return vcombine_{armtyp}(vget_high_{armtyp}({in0}),
                                           vget_low_{armtyp}({in0}));'''. \
                                           format(armtyp=armtyp, **fmtspec)

# -----------------------------------------------------------------------------
# Horizontal sum

# Emit C source summing all lanes into a scalar: pairwise vadd/vext folding
# on neon128, vaddvq on aarch64, svaddv on SVE.
def addv(simd_ext, typ):
    if simd_ext == 'neon128':
        if typ == 'f64':
            return 'return ({typ})({in0}.v0 + {in0}.v1);'.format(**fmtspec)
        elif typ == 'f16':
            # NOTE(review): in the FP16 branch the vext shifts use lane
            # constants 3 and 0; a 4-lane halving reduction would normally
            # use 2 then 1 (as the f32 paths below do). TODO confirm this
            # produces the correct sum on real FP16 hardware.
            return \
            '''#ifdef NSIMD_ARM_FP16
                 {t} tmp = vadd_{suf}(vget_low_{suf}({in0}),
                                      vget_high_{suf}({in0}));
                 tmp = vadd_{suf}(tmp, vext_{suf}(tmp, tmp, 3));
                 tmp = vadd_{suf}(tmp, vext_{suf}(tmp, tmp, 0));
                 return vget_lane_{suf}(tmp, 0);
               #else
                 float32x2_t tmp0 = vadd_f32(vget_low_f32({in0}.v0),
                                             vget_high_f32({in0}.v0));
                 tmp0 = vadd_f32(tmp0, vext_f32(tmp0, tmp0, 1));
                 float32x2_t tmp1 = vadd_f32(vget_low_f32({in0}.v1),
                                             vget_high_f32({in0}.v1));
                 tmp1 = vadd_f32(tmp1, vext_f32(tmp1, tmp1, 1));
                 return nsimd_f32_to_f16(vget_lane_f32(tmp0, 0) +
                                         vget_lane_f32(tmp1, 0));
               #endif''' .format(t=half_neon64_typ(typ), **fmtspec)
        elif typ == 'f32':
            return \
            '''{t} tmp = vadd_{suf}(vget_low_{suf}({in0}),
                                    vget_high_{suf}({in0}));
               tmp = vadd_{suf}(tmp, vext_{suf}(tmp, tmp, 1));
               return vget_lane_{suf}(tmp, 0);'''. \
               format(t=half_neon64_typ(typ), **fmtspec)
        elif typ[0] in ['i', 'u']:
            le = 128 // int(typ[1:]);
            return \
            '''{typ} res = ({typ})0;
               {typ} buf[{le}];
               vst1q_{suf}(buf, {in0});
               for (int i = 0; i < {le}; i++) {{
                 res += buf[i];
               }}
               return res;'''. \
               format(le=le, **fmtspec)
    elif simd_ext == 'aarch64':
        # NOTE(review): integer types on aarch64 fall through every branch
        # here and return None — presumably addv is only generated for
        # float types on this path; TODO confirm against the operator list.
        if typ == 'f16':
            return \
            '''#ifdef NSIMD_ARM_FP16
                 {t} tmp = vadd_{suf}(vget_low_{suf}({in0}),
                                      vget_high_{suf}({in0}));
                 tmp = vadd_{suf}(tmp, vext_{suf}(tmp, tmp, 3));
                 tmp = vadd_{suf}(tmp, vext_{suf}(tmp, tmp, 0));
                 return vget_lane_{suf}(tmp, 0);
               #else
                 float32x2_t tmp0 = vadd_f32(vget_low_f32({in0}.v0),
                                             vget_high_f32({in0}.v0));
                 tmp0 = vadd_f32(tmp0, vext_f32(tmp0, tmp0, 1));
                 float32x2_t tmp1 = vadd_f32(vget_low_f32({in0}.v1),
                                             vget_high_f32({in0}.v1));
                 tmp1 = vadd_f32(tmp1, vext_f32(tmp1, tmp1, 1));
                 return nsimd_f32_to_f16(vget_lane_f32(tmp0, 0) +
                                         vget_lane_f32(tmp1, 0));
               #endif''' .format(t=half_neon64_typ(typ), **fmtspec)
        elif typ in ['f32', 'f64']:
            return 'return vaddvq_{suf}({in0});'.format(**fmtspec)
    elif simd_ext in sve:
        return 'return svaddv_{suf}({svtrue}, {in0});' .format(**fmtspec)

# -----------------------------------------------------------------------------
# Up convert

# Emit C source widening one vector into a pair of vectors of the next-wider
# type (vmovl/vcvt on NEON, svunpklo/hi + svcvt on SVE).
def upcvt1(simd_ext, from_typ, to_typ):
    # For integer upcast, due to 2's complement representation
    # _s : signed -> bigger signed
    # _s : signed -> bigger unsigned
    # _u : unsigned -> bigger signed
    # _u : unsigned -> bigger unsigned
    if simd_ext in neon:
        if from_typ == 'f16' and to_typ == 'f32':
            return \
            '''#ifdef NSIMD_ARM_FP16
                 nsimd_{simd_ext}_vf32x2 ret;
                 ret.v0 = vcvt_f32_f16(vget_low_{suf}({in0}));
                 ret.v1 = vcvt_f32_f16(vget_high_{suf}({in0}));
                 return ret;
               #else
                 nsimd_{simd_ext}_vf32x2 ret;
                 ret.v0 = {in0}.v0;
                 ret.v1 = {in0}.v1;
                 return ret;
               #endif'''.format(**fmtspec)
        elif from_typ == 'f32' and to_typ == 'f64':
            if simd_ext == 'neon128':
                return \
                '''nsimd_neon128_vf64x2 ret;
                   f32 buf[4];
                   vst1q_f32(buf, {in0});
                   ret.v0.v0 = (f64)buf[0];
                   ret.v0.v1 = (f64)buf[1];
                   ret.v1.v0 = (f64)buf[2];
                   ret.v1.v1 = (f64)buf[3];
                   return ret;'''.format(**fmtspec)
            else:
                return \
                '''nsimd_aarch64_vf64x2 ret;
                   ret.v0 = vcvt_f64_f32(vget_low_{suf}({in0}));
                   ret.v1 = vcvt_f64_f32(vget_high_{suf}({in0}));
                   return ret;'''.format(**fmtspec)
        elif (from_typ in common.itypes and to_typ in common.itypes) or \
             (from_typ in common.utypes and to_typ in common.utypes):
            return '''nsimd_{simd_ext}_v{to_typ}x2 ret;
                      ret.v0 = vmovl_{suf}(vget_low_{suf}({in0}));
                      ret.v1 = vmovl_{suf}(vget_high_{suf}({in0}));
                      return ret;'''.format(**fmtspec)
        elif (from_typ in common.itypes and to_typ in common.utypes) or \
             (from_typ in common.utypes and to_typ in common.itypes):
            return '''nsimd_{simd_ext}_v{to_typ}x2 ret;
                      ret.v0 = vreinterpretq_{suf_to_typ}_{suf_int_typ}(
                                 vmovl_{suf}(vget_low_{suf}({in0})));
                      ret.v1 = vreinterpretq_{suf_to_typ}_{suf_int_typ}(
                                 vmovl_{suf}(vget_high_{suf}({in0})));
                      return ret;'''. \
                      format(suf_to_typ=suf(to_typ),
                             suf_int_typ=suf(from_typ[0] + to_typ[1:]),
                             **fmtspec)
        else:
            # int <-> float width change: widen in the integer domain then
            # convert
            return \
            '''nsimd_{simd_ext}_v{to_typ}x2 ret;
               nsimd_{simd_ext}_v{int_typ}x2 tmp;
               tmp = nsimd_upcvt_{simd_ext}_{int_typ}_{from_typ}({in0});
               ret.v0 = nsimd_cvt_{simd_ext}_{to_typ}_{int_typ}(tmp.v0);
               ret.v1 = nsimd_cvt_{simd_ext}_{to_typ}_{int_typ}(tmp.v1);
               return ret;'''. \
               format(int_typ=from_typ[0] + to_typ[1:], **fmtspec)

    # Getting here means that we deal with SVE
    if (from_typ in common.itypes and to_typ in common.itypes) or \
       (from_typ in common.utypes and to_typ in common.utypes):
        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;
                  ret.v0 = svunpklo_{suf_to_typ}({in0});
                  ret.v1 = svunpkhi_{suf_to_typ}({in0});
                  return ret;'''.format(suf_to_typ=suf(to_typ), **fmtspec)
    elif (from_typ in common.itypes and to_typ in common.utypes) or \
         (from_typ in common.utypes and to_typ in common.itypes):
        return \
        '''nsimd_{simd_ext}_v{to_typ}x2 ret;
           ret.v0 = svreinterpret_{suf_to_typ}_{suf_int_typ}(
                      svunpklo_{suf_int_typ}({in0}));
           ret.v1 = svreinterpret_{suf_to_typ}_{suf_int_typ}(
                      svunpkhi_{suf_int_typ}({in0}));
           return ret;'''. \
           format(suf_to_typ=suf(to_typ),
                  suf_int_typ=suf(from_typ[0] + to_typ[1:]), **fmtspec)
    elif from_typ in common.iutypes and to_typ in common.ftypes:
        return \
        '''nsimd_{simd_ext}_v{to_typ}x2 ret;
           ret.v0 = svcvt_{suf_to_typ}_{suf_int_typ}_x(
                        {svtrue}, svunpklo_{suf_int_typ}({in0}));
           ret.v1 = svcvt_{suf_to_typ}_{suf_int_typ}_x(
                        {svtrue}, svunpkhi_{suf_int_typ}({in0}));
           return ret;'''. \
           format(suf_to_typ=suf(to_typ),
                  suf_int_typ=suf(from_typ[0] + to_typ[1:]), **fmtspec)
    else:
        return \
        '''nsimd_{simd_ext}_v{to_typ}x2 ret;
           ret.v0 = svcvt_{suf_to_typ}_{suf}_x({svtrue}, svzip1_{suf}(
                      {in0}, {in0}));
           ret.v1 = svcvt_{suf_to_typ}_{suf}_x({svtrue}, svzip2_{suf}(
                      {in0}, {in0}));
           return ret;'''.format(suf_to_typ=suf(to_typ), **fmtspec)

# -----------------------------------------------------------------------------
# Down convert

# Emit C source narrowing two vectors into one vector of the next-narrower
# type (vmovn/vcvt + vcombine on NEON, svuzp1 + svcvt on SVE).
def downcvt1(simd_ext, from_typ, to_typ):
    if simd_ext in neon:
        if from_typ == 'f64' and to_typ == 'f32':
            if simd_ext == 'neon128':
                return '''f32 buf[4];
                          buf[0] = (f32){in0}.v0;
                          buf[1] = (f32){in0}.v1;
                          buf[2] = (f32){in1}.v0;
                          buf[3] = (f32){in1}.v1;
                          return vld1q_f32(buf);'''.format(**fmtspec)
            else:
                return '''return vcombine_f32(vcvt_f32_f64({in0}),
                                              vcvt_f32_f64({in1}));'''. \
                                              format(**fmtspec)
        elif from_typ == 'f32' and to_typ == 'f16':
            return '''#ifdef NSIMD_ARM_FP16
                        return vcombine_f16(vcvt_f16_f32({in0}),
                                            vcvt_f16_f32({in1}));
                      #else
                        nsimd_{simd_ext}_vf16 ret;
                        ret.v0 = {in0};
                        ret.v1 = {in1};
                        return ret;
                      #endif'''.format(**fmtspec)
        elif (from_typ in common.itypes and to_typ in common.itypes) or \
             (from_typ in common.utypes and to_typ in common.utypes):
            return '''return vcombine_{suf_to_typ}(vmovn_{suf}({in0}),
                               vmovn_{suf}({in1}));'''. \
                               format(suf_to_typ=suf(to_typ), **fmtspec)
        # NOTE(review): this elif repeats the exact condition of the branch
        # above, so it is dead code (its string also lacks a matching
        # closing parenthesis and a source-type suffix on vreinterpretq).
        # It was presumably meant to handle the mixed-signedness cases
        # (itypes -> utypes and vice versa), which currently fall through
        # to the cvt-based else branch. TODO confirm.
        elif (from_typ in common.itypes and to_typ in common.itypes) or \
             (from_typ in common.utypes and to_typ in common.utypes):
            return '''return vreinterpretq_{suf_to_typ}(
                               vcombine_{suf_to_typ}(vmovn_{suf}({in0}),
                                 vmovn_{suf}({in1}));'''. \
                                 format(suf_to_typ=suf(to_typ), **fmtspec)
        else:
            return \
            '''return nsimd_downcvt_{simd_ext}_{to_typ}_{int_typ}(
                        nsimd_cvt_{simd_ext}_{int_typ}_{from_typ}({in0}),
                        nsimd_cvt_{simd_ext}_{int_typ}_{from_typ}({in1}));'''.\
                        format(int_typ=to_typ[0] + from_typ[1:], **fmtspec)

    # Getting here means that we deal with SVE
    if from_typ in common.iutypes and to_typ in common.iutypes:
        return '''return svuzp1_{suf_to_typ}(
                           svreinterpret_{suf_to_typ}_{suf}({in0}),
                           svreinterpret_{suf_to_typ}_{suf}({in1}));'''. \
                           format(suf_to_typ=suf(to_typ), **fmtspec)
    elif from_typ in common.ftypes and to_typ in common.iutypes:
        return \
        '''return svuzp1_{suf_to_typ}(svreinterpret_{suf_to_typ}_{suf_int_typ}(
                    svcvt_{suf_int_typ}_{suf}_x({svtrue}, {in0})),
                    svreinterpret_{suf_to_typ}_{suf_int_typ}(
                      svcvt_{suf_int_typ}_{suf}_x({svtrue}, {in1})));'''. \
                      format(suf_to_typ=suf(to_typ),
                             suf_int_typ=suf(to_typ[0] + from_typ[1:]),
                             **fmtspec)
    else:
        return \
        '''return svuzp1_{suf_to_typ}(svcvt_{suf_to_typ}_{suf}_x(
                    {svtrue}, {in0}), svcvt_{suf_to_typ}_{suf}_x(
                      {svtrue}, {in1}));'''. \
                      format(suf_to_typ=suf(to_typ), **fmtspec)

# -----------------------------------------------------------------------------
# adds

# Emit C source for saturated addition: plain add for floats, vqaddq/svqadd
# for integers.
def adds(simd_ext, from_typ):
    if from_typ in common.ftypes:
        return 'return nsimd_add_{simd_ext}_{from_typ}({in0}, {in1});'. \
               format(**fmtspec)
    if simd_ext in neon:
        return 'return vqaddq_{suf}({in0}, {in1});'.format(**fmtspec)
    else:
        return 'return svqadd_{suf}({in0}, {in1});'.format(**fmtspec)

# -----------------------------------------------------------------------------
# subs

# Emit C source for saturated subtraction: plain sub for floats,
# vqsubq/svqsub for integers.
def subs(simd_ext, from_typ):
    if from_typ in common.ftypes:
        return 'return nsimd_sub_{simd_ext}_{from_typ}({in0}, {in1});'. \
               format(**fmtspec)
    elif simd_ext in neon:
        return 'return vqsubq_{suf}({in0}, {in1});'.format(**fmtspec)
    else:
        return 'return svqsub_{suf}({in0}, {in1});'.format(**fmtspec)

# -----------------------------------------------------------------------------
# to_mask

# Emit C source converting a logical vector into a value vector of all-ones /
# all-zeros lanes (reinterpret on NEON, svsel on SVE predicates).
def to_mask1(opts, simd_ext, typ):
    if typ in common.itypes + common.ftypes:
        normal = 'return vreinterpretq_{suf}_u{typnbits}({in0});'. \
                 format(**fmtspec)
    else:
        normal = 'return {in0};'.format(**fmtspec)
    emulate_f16 = '''nsimd_{simd_ext}_vf16 ret;
                     ret.v0 = nsimd_to_mask_{simd_ext}_f32({in0}.v0);
                     ret.v1 = nsimd_to_mask_{simd_ext}_f32({in0}.v1);
                     return ret;'''.format(**fmtspec)
    if simd_ext == 'neon128' and typ == 'f16':
        return emulate_f16
    elif simd_ext == 'neon128' and typ == 'f64':
        return '''nsimd_neon128_vf64 ret;
                  ret.v0 = nsimd_scalar_reinterpret_f64_u64({in0}.v0);
                  ret.v1 = nsimd_scalar_reinterpret_f64_u64({in0}.v1);
                  return ret;'''.format(**fmtspec)
    elif simd_ext == 'aarch64' and typ == 'f16':
        return '''#ifdef NSIMD_ARM_FP16
                    {normal}
                  #else
                    {emulate_f16}
                  #endif'''.format(normal=normal, emulate_f16=emulate_f16)
    elif simd_ext in sve:
        if opts.sve_emulate_bool:
            return 'return svreinterpret_{suf}_u{typnbits}({in0});'. \
                   format(**fmtspec)
        else:
            utyp = 'u{}'.format(fmtspec['typnbits'])
            return '''return svreinterpret_{suf}_{utyp}(svsel_{utyp}(
                               {in0}, svdup_n_{utyp}(({utyp})-1),
                               svdup_n_{utyp}(({utyp})0)));'''. \
                               format(utyp=utyp, **fmtspec)
    else:
        return normal

# -----------------------------------------------------------------------------
# iota

# Emit C source filling a vector with 0, 1, 2, …: svindex on SVE, a constant
# buffer load on NEON.
def iota(simd_ext, typ):
    if simd_ext in sve:
        if typ in common.iutypes:
            return 'return svindex_{suf}(0, 1);'.format(**fmtspec)
        else:
            return \
            '''return svcvt_{suf}_s{typnbits}_x({svtrue},
                        svindex_s{typnbits}(0, 1));'''.format(**fmtspec)
    if typ == 'f64' and simd_ext == 'neon128':
        return '''nsimd_neon128_vf64 ret;
                  ret.v0 = 0.0;
                  ret.v1 = 1.0;
                  return ret;'''.format(**fmtspec)
    typ2 = 'f32' if typ == 'f16' else typ
    le = 128 // int(typ[1:])
    iota = ', '.join(['({typ2}){i}'.format(typ2=typ2, i=i) \
                      for i in range(le)])
    normal = '''{typ} buf[{le}] = {{ {iota} }};
                return vld1q_{suf}(buf);'''. \
                format(le=le, iota=iota, **fmtspec)
    if typ == 'f16':
        return '''#ifdef NSIMD_ARM_FP16
                    {normal}
                  #else
                    f32 buf[8] = {{ {iota} }};
                    nsimd_{simd_ext}_vf16 ret;
                    ret.v0 = vld1q_f32(buf);
                    ret.v1 = vld1q_f32(buf + 4);
                    return ret;
                  #endif'''.format(iota=iota, normal=normal, **fmtspec)
    return normal

# -----------------------------------------------------------------------------
# mask_for_loop_tail

# Emit C source building a loop-tail mask: all-false when the range is empty,
# iota < remaining when a partial vector is left, all-true otherwise.
def mask_for_loop_tail(simd_ext, typ):
    if typ == 'f16':
        threshold = 'nsimd_f32_to_f16((f32)({in1} - {in0}))'.format(**fmtspec)
    else:
        threshold = '({typ})({in1} - {in0})'.format(**fmtspec)
    if simd_ext == 'sve':
        le = 'nsimd_len_sve_{typ}()'.format(**fmtspec)
    elif simd_ext in fixed_sized_sve:
        le = int(simd_ext[3:]) // int(typ[1:])
    else:
        le = 128 // int(typ[1:])
    return '''if ({in0} >= {in1}) {{
                return nsimd_set1l_{simd_ext}_{typ}(0);
              }}
              if ({in1} - {in0} < {le}) {{
                nsimd_{simd_ext}_v{typ} n =
                      nsimd_set1_{simd_ext}_{typ}({threshold});
                return nsimd_lt_{simd_ext}_{typ}(
                           nsimd_iota_{simd_ext}_{typ}(), n);
              }} else {{
                return nsimd_set1l_{simd_ext}_{typ}(1);
              }}'''.format(le=le, threshold=threshold, **fmtspec)

# -----------------------------------------------------------------------------
# to_logical

def to_logical1(opts, simd_ext, typ):
    if typ in common.iutypes:
        return '''return 
                      nsimd_ne_{simd_ext}_{typ}({in0},
                          nsimd_set1_{simd_ext}_{typ}(({typ})0));'''. \
               format(**fmtspec)
    # Floats: compare the bit pattern against 0 through the unsigned type.
    normal_fp = \
    '''return nsimd_reinterpretl_{simd_ext}_{suf}_{utyp}(
                  nsimd_ne_{simd_ext}_{utyp}(
                      nsimd_reinterpret_{simd_ext}_{utyp}_{typ}(
                          {in0}), nsimd_set1_{simd_ext}_{utyp}(({utyp})0)));'''. \
    format(utyp='u{}'.format(fmtspec['typnbits']), **fmtspec)
    if typ in ['f32', 'f64'] or (typ == 'f16' and simd_ext in sve):
        return normal_fp
    # f16 without native support: recurse on the two emulated f32 halves.
    emulate_fp16 = \
    '''nsimd_{simd_ext}_vlf16 ret;
       ret.v0 = nsimd_to_logical_{simd_ext}_f32({in0}.v0);
       ret.v1 = nsimd_to_logical_{simd_ext}_f32({in0}.v1);
       return ret;'''.format(**fmtspec)
    if simd_ext == 'aarch64':
        return '''#ifdef NSIMD_ARM_FP16
                    {normal_fp}
                  #else
                    {emulate_fp16}
                  #endif'''.format(normal_fp=normal_fp,
                                   emulate_fp16=emulate_fp16)
    elif simd_ext == 'neon128':
        return emulate_fp16

# -----------------------------------------------------------------------------
# unpack functions

def zip_unzip_half(func, simd_ext, typ):
    # func is one of 'zip1', 'zip2', 'uzp1', 'uzp2'; returns the C body of
    # the corresponding half zip/unzip.
    if simd_ext == 'aarch64' or simd_ext in sve:
        if typ == 'f16' and simd_ext == 'aarch64':
            # Without native FP16 the f16 vector is two f32 vectors (.v0/.v1).
            if func in ['zip1', 'zip2']:
                return '''\
#ifdef NSIMD_ARM_FP16
  return {s}v{op}{q}_{suf}({in0}, {in1});
#else
  nsimd_{simd_ext}_v{typ} ret;
  ret.v0 = {s}vzip1{q}_f32({in0}.v{i}, {in1}.v{i});
  ret.v1 = {s}vzip2{q}_f32({in0}.v{i}, {in1}.v{i});
  return ret;
#endif
'''.format(op=func, i='0' if func in ['zip1', 'uzp1'] else '1',
           s='s' if simd_ext in sve else '',
           q='' if simd_ext in sve else 'q', **fmtspec)
            else:
                return '''\
#ifdef NSIMD_ARM_FP16
  return {s}v{op}{q}_{suf}({in0}, {in1});
#else
  nsimd_{simd_ext}_v{typ} ret;
  ret.v0 = {s}v{func}{q}_f32({in0}.v0, {in0}.v1);
  ret.v1 = {s}v{func}{q}_f32({in1}.v0, {in1}.v1);
  return ret;
#endif'''.format(op=func, func=func,
                 s='s' if simd_ext in sve else '',
                 q='' if simd_ext in sve else 'q', **fmtspec)
        else:
            # aarch64/SVE expose zip1/zip2/uzp1/uzp2 intrinsics directly.
            return 'return {s}v{op}{q}_{suf}({in0}, {in1});'. \
                   format(op=func, s='s' if simd_ext in sve else '',
                          q='' if simd_ext in sve else 'q', **fmtspec)
    elif simd_ext == 'neon128':
        # ARMv7 only has vzipq/vuzpq which return both halves at once.
        armop = {'zip1': 'zipq', 'zip2': 'zipq', 'uzp1': 'uzpq',
                 'uzp2': 'uzpq'}
        prefix = {'i': 'int', 'u': 'uint', 'f': 'float'}
        neon_typ = '{}{}x{}x2_t'. \
                   format(prefix[typ[0]], typ[1:], 128 // int(typ[1:]))
        if typ == 'f16':
            if func in ['zip1', 'zip2']:
                return '''\
nsimd_{simd_ext}_v{typ} ret;
float32x4x2_t tmp = v{op}_f32({in0}.v{i}, {in1}.v{i});
ret.v0 = tmp.val[0];
ret.v1 = tmp.val[1];
return ret;
'''.format(i='0' if func == 'zip1' else '1', op=armop[func], **fmtspec)
            else:
                return '''\
nsimd_{simd_ext}_v{typ} ret;
float32x4x2_t tmp0 = vuzpq_f32({in0}.v0, {in0}.v1);
float32x4x2_t tmp1 = vuzpq_f32({in1}.v0, {in1}.v1);
ret.v0 = tmp0.val[{i}];
ret.v1 = tmp1.val[{i}];
return ret;
'''.format(i='0' if func == 'uzp1' else '1', **fmtspec)
        elif typ in ['i64', 'u64']:
            # No vzipq/vuzpq for 64-bit lanes: go through memory.
            return '''\
{typ} buf0[2], buf1[2];
{typ} ret[2];
vst1q_{suf}(buf0, {in0});
vst1q_{suf}(buf1, {in1});
ret[0] = buf0[{i}];
ret[1] = buf1[{i}];
return vld1q_{suf}(ret);'''. \
            format(**fmtspec, i='0' if func in ['zip1', 'uzp1'] else '1')
        elif typ == 'f64':
            # Emulated f64: just pick the right scalar lane of each input.
            return '''\
nsimd_{simd_ext}_v{typ} ret;
ret.v0 = {in0}.v{i};
ret.v1 = {in1}.v{i};
return ret;'''. \
            format(**fmtspec, i='0' if func in ['zip1', 'uzp1'] else '1')
        else:
            return '''\
{neon_typ} res;
res = v{op}_{suf}({in0}, {in1});
return res.val[{i}];'''. \
            format(neon_typ=neon_typ, op=armop[func], **fmtspec,
                   i='0' if func in ['zip1', 'uzp1'] else '1')

def zip_unzip(func, simd_ext, typ):
    # func is 'zip' or 'uzp'; returns both halves as a x2 structure.
    # Fallback built from the lo/hi operators above.
    lo_hi = '''\
nsimd_{simd_ext}_v{typ}x2 ret;
ret.v0 = nsimd_{func}lo_{simd_ext}_{typ}({in0}, {in1});
ret.v1 = nsimd_{func}hi_{simd_ext}_{typ}({in0}, {in1});
return ret;
'''.format(func='zip' if func == 'zip' else 'unzip', **fmtspec)
    if simd_ext == 'aarch64' or simd_ext in sve:
        content = '''\
nsimd_{simd_ext}_v{typ}x2 ret;
ret.v0 = {s}v{func}1{q}_{suf}({in0}, {in1});
ret.v1 = {s}v{func}2{q}_{suf}({in0}, {in1});
return ret;'''.format(s='s' if simd_ext in sve else '',
                      q='' if simd_ext in sve else 'q',
                      func=func, **fmtspec)
        if typ == 'f16':
            return '''\
#ifdef NSIMD_ARM_FP16
{c}
#else
{default}
#endif'''. \
            format(c=content, default=lo_hi,
                   s='s' if simd_ext in sve else '', **fmtspec)
        else:
            return content
    else:
        prefix = {'i': 'int', 'u': 'uint', 'f': 'float'}
        neon_typ = '{}{}x{}x2_t'. \
                   format(prefix[typ[0]], typ[1:], 128 // int(typ[1:]))
        content = '''\
nsimd_{simd_ext}_v{typ}x2 ret;
{neon_typ} tmp = v{func}q_{suf}({in0}, {in1});
ret.v0 = tmp.val[0];
ret.v1 = tmp.val[1];
return ret;''' \
        .format(func=func, neon_typ=neon_typ, **fmtspec)
        if typ in ['u64', 'i64', 'f64']:
            return lo_hi
        elif typ == 'f16':
            return '''\
#ifdef NSIMD_ARM_FP16
{content}
#else
{default}
#endif'''. \
            format(content=content, default=lo_hi,
                   f='zip' if func == 'zip' else 'unzip', **fmtspec)
        else:
            return content

# -----------------------------------------------------------------------------
# gather

def gather(simd_ext, typ):
    # C body loading lanes from {in0}[{in1}[k]] (indices are signed ints of
    # the same width as typ).
    le = max_len(simd_ext, typ)
    real_le = real_len(simd_ext, typ)
    if simd_ext in sve:
        # Scalar emulation through buffers; used for types with no native
        # gather (8/16-bit) and for f16.
        emul = '''int i;
                  {typ} buf[{le}];
                  i{typnbits} offset_buf[{le}];
                  svst1_s{typnbits}({svtrue}, offset_buf, {in1});
                  for (i = 0; i < {real_le}; i++) {{
                    buf[i] = {in0}[offset_buf[i]];
                  }}
                  return svld1_{suf}({svtrue}, buf);'''. \
                  format(le=le, real_le=real_le, **fmtspec)
    else:
        # NEON: lane-by-lane vgetq_lane/vsetq_lane emulation.
        emul = \
        '''nsimd_{simd_ext}_v{typ} ret;
           ret = vdupq_n_{suf}({in0}[vgetq_lane_s{typnbits}({in1}, 0)]);'''. \
        format(**fmtspec) + ''.join([
        '''ret = vsetq_lane_{suf}({in0}[
                     vgetq_lane_s{typnbits}({in1}, {i})], ret, {i});\n'''. \
        format(i=i, **fmtspec) for i in range(1, le)]) + \
        'return ret;'
    if typ == 'f16':
        if simd_ext in sve:
            return emul
        # Without native FP16 gather through an f32 buffer, 4 lanes per half.
        return '''#ifdef NSIMD_ARM_FP16
                  {emul}
                  #else
                  nsimd_{simd_ext}_vf16 ret;
                  f32 buf[8];
                  '''.format(emul=emul, **fmtspec) + \
               ''.join(['buf[{i}] = nsimd_f16_to_f32({in0}[' \
                        'vgetq_lane_s16({in1}, {i})]);\n'. \
                        format(i=i, **fmtspec) for i in range(4)]) + \
               ''.join(['buf[4 + {i}] = nsimd_f16_to_f32({in0}[' \
                        'vgetq_lane_s16({in1}, 4 + {i})]);\n'. \
                        format(i=i, **fmtspec) for i in range(4)]) + \
               '''
                  ret.v0 = vld1q_f32(buf);
                  ret.v1 = vld1q_f32(buf + 4);
                  return ret;
                  #endif'''.format(**fmtspec)
    if simd_ext == 'neon128' and typ == 'f64':
        return '''nsimd_neon128_vf64 ret;
                  i64 offset_buf[2];
                  vst1q_s64(offset_buf, {in1});
                  ret.v0 = {in0}[offset_buf[0]];
                  ret.v1 = {in0}[offset_buf[1]];
                  return ret;'''.format(**fmtspec)
    if simd_ext in neon or typ in ['i8', 'u8', 'i16', 'u16']:
        return emul
    # getting here means SVE
    return 'return svld1_gather_s{typnbits}index_{suf}({svtrue}, {in0}, ' \
           '{in1});'.format(**fmtspec)

# -----------------------------------------------------------------------------
# linear gather

def gather_linear(simd_ext, typ):
    # C body loading lanes from {in0}[k * {in1}] (constant stride).
    if simd_ext in sve:
        if typ in ['i8', 'u8', 'i16', 'u16', 'f16']:
            # No native strided load for these: buffer emulation.
            le = max_len(simd_ext, typ)
            real_le = real_len(simd_ext, typ)
            return '''{typ} buf[{le}];
                      int i;
                      for (i = 0; i < {real_le}; i++) {{
                        buf[i] = {in0}[i * {in1}];
                      }}
                      return svld1_{suf}({svtrue}, buf);'''. \
                   format(le=le, real_le=real_le, **fmtspec)
        else:
            # Strided load expressed as a gather with an svindex vector.
            return 'return svld1_gather_s{typnbits}index_{suf}({svtrue}, ' \
                   '{in0}, svindex_s{typnbits}(0, (i{typnbits}){in1}));'. \
                   format(**fmtspec)
    # getting here means neon128 and aarch64
    intrinsic = '''nsimd_{simd_ext}_v{typ} ret;
                   ret = vdupq_n_{suf}({in0}[0]);
                   '''.format(**fmtspec) + ''.join([
                'ret = vsetq_lane_{suf}({in0}[{i} * {in1}], ret, {i});\n'. \
                format(i=i, **fmtspec) \
                for i in range(1, 128 // int(fmtspec['typnbits']))]) + \
                '''return ret;'''
    if typ == 'f16':
        return '''#ifdef NSIMD_ARM_FP16
                  {intrinsic}
                  #else
                  nsimd_{simd_ext}_vf16 ret;
                  f32 buf[8];
                  int i;
                  for (i = 0; i < 8; i++) {{
                    buf[i] = nsimd_f16_to_f32({in0}[i * {in1}]);
                  }}
                  ret.v0 = vld1q_f32(buf);
                  ret.v1 = vld1q_f32(buf + 4);
                  return ret;
                  #endif'''.format(intrinsic=intrinsic, **fmtspec)
    if typ == 'f64' and simd_ext == 'neon128':
        return '''nsimd_neon128_vf64 ret;
                  ret.v0 = {in0}[0];
                  ret.v1 = {in0}[{in1}];
                  return ret;'''.format(**fmtspec)
    return intrinsic

# -----------------------------------------------------------------------------
# masked gather

def maskoz_gather(oz, simd_ext, typ):
    # oz == 'z': inactive lanes are zero; oz == 'o': they come from {in3}.
    le = max_len(simd_ext, typ)
    real_le = real_len(simd_ext, typ)
    if simd_ext in sve:
        utyp = 'u{typnbits}'.format(**fmtspec)
        # Spill the index vector, the mask (as 0/-1 integers) and the
        # default values, then gather scalar-wise below.
        store = '''svst1_s{typnbits}({svtrue}, offset_buf, {in2});
                   svst1_{utyp}({svtrue}, mask, svsel_{utyp}(
                       {in0}, svdup_n_{utyp}(({utyp})-1), svdup_n_{utyp}(
                       ({utyp})0)));
                   '''.format(utyp=utyp, **fmtspec)
        if oz == 'z':
            store += 'svst1_{suf}({svtrue}, buf, svdup_n_{suf}(({typ})0));'. \
                     format(**fmtspec)
        else:
            store += 'svst1_{suf}({svtrue}, buf, {in3});'.format(**fmtspec)
        load = 'svld1_{suf}({svtrue}, buf)'.format(**fmtspec)
    else:
        store = '''vst1q_s{typnbits}(offset_buf, {in2});
                   vst1q_u{typnbits}(mask, {in0});'''.format(**fmtspec)
        if oz == 'z':
            store += 'vst1q_{suf}(buf, vdupq_n_{suf}(({typ})0));'. \
                     format(**fmtspec)
        else:
            store += 'vst1q_{suf}(buf, {in3});'.format(**fmtspec)
        load = 'vld1q_{suf}(buf)'.format(**fmtspec)
    emul = '''int i;
              {typ} buf[{le}];
              u{typnbits} mask[{le}];
              i{typnbits} offset_buf[{le}];
              {store}
              for (i = 0; i < {real_le}; i++) {{
                if (mask[i]) {{
                  buf[i] = {in1}[offset_buf[i]];
                }}
              }}
              return {load};'''. \
           format(le=le, real_le=real_le, store=store, load=load, **fmtspec)
    if typ == 'f16':
        if simd_ext in sve:
            return emul
        if oz == 'z':
            oz0 = 'vdupq_n_f32(0.0f)'
            oz1 = oz0
        else:
            oz0 = '{in3}.v0'.format(**fmtspec)
            oz1 = '{in3}.v1'.format(**fmtspec)
        return '''#ifdef NSIMD_ARM_FP16
                  {emul}
                  #else
                  nsimd_{simd_ext}_vf16 ret;
                  int i;
                  f32 buf[{le}];
                  u32 mask[{le}];
                  i16 offset_buf[{le}];
                  vst1q_s16(offset_buf, {in2});
                  vst1q_f32(buf, {oz0});
                  vst1q_f32(buf + {leo2}, {oz1});
                  vst1q_u32(mask, {in0}.v0);
                  vst1q_u32(mask + {leo2}, {in0}.v1);
                  for (i = 0; i < {le}; i++) {{
                    if (mask[i]) {{
                      buf[i] = nsimd_f16_to_f32({in1}[offset_buf[i]]);
                    }}
                  }}
                  ret.v0 = vld1q_f32(buf);
                  ret.v1 = vld1q_f32(buf + {leo2});
                  return ret;
                  #endif'''.format(emul=emul, leo2=le // 2, le=le, oz0=oz0,
                                   oz1=oz1, **fmtspec)
    if simd_ext == 'neon128' and typ == 'f64':
        oz0 = '0.0' if oz == 'z' else '{in3}.v0'.format(**fmtspec)
        oz1 = '0.0' if oz == 'z' else '{in3}.v1'.format(**fmtspec)
        return '''nsimd_neon128_vf64 ret;
                  i64 offset_buf[2];
                  vst1q_s64(offset_buf, {in2});
                  if ({in0}.v0) {{
                    ret.v0 = {in1}[offset_buf[0]];
                  }} else {{
                    ret.v0 = {oz0};
                  }}
                  if ({in0}.v1) {{
                    ret.v1 = {in1}[offset_buf[1]];
                  }} else {{
                    ret.v1 = {oz1};
                  }}
                  return ret;'''.format(oz0=oz0, oz1=oz1, **fmtspec)
    if simd_ext in neon or typ in ['i8', 'u8', 'i16', 'u16']:
        return emul
    # getting here means SVE
    oz0 = 'svdup_n_{suf}(({typ})0)'.format(**fmtspec) if oz == 'z' \
          else '{in3}'.format(**fmtspec)
    return '''return svsel_{suf}({in0}, svld1_gather_s{typnbits}index_{suf}(
                  {in0}, {in1}, {in2}), {oz0});'''. \
           format(oz0=oz0, **fmtspec)

# -----------------------------------------------------------------------------
# scatter

def scatter(simd_ext, typ):
    # C body storing lane k of {in2} to {in0}[{in1}[k]].
    le = max_len(simd_ext, typ)
    real_le = real_len(simd_ext, typ)
    if simd_ext in sve:
        emul = '''int i;
                  {typ} buf[{le}];
                  i{typnbits} offset_buf[{le}];
                  svst1_s{typnbits}({svtrue}, offset_buf, {in1});
                  svst1_{suf}({svtrue}, buf, {in2});
                  for (i = 0; i < {real_le}; i++) {{
                    {in0}[offset_buf[i]] = buf[i];
                  }}'''.format(le=le, real_le=real_le, **fmtspec)
    else:
        emul = '\n'.join(['{in0}[vgetq_lane_s{typnbits}({in1}, {i})] = ' \
                          'vgetq_lane_{suf}({in2}, {i});\n'. \
                          format(i=i, **fmtspec) for i in range(int(le))])
    if typ == 'f16':
        if simd_ext in sve:
            return emul
        return '''#ifdef NSIMD_ARM_FP16
                  {emul}
                  #else
                  '''.format(emul=emul) + \
               '\n'.join(['{in0}[vgetq_lane_s16({in1}, {i})] = ' \
                          'nsimd_f32_to_f16(vgetq_lane_f32({in2}.v0, '
                          '{i}));\n'.format(i=i, **fmtspec) \
                          for i in range(4)]) + \
               '\n'.join(['{in0}[vgetq_lane_s16({in1}, 4 + {i})] = ' \
                          'nsimd_f32_to_f16(vgetq_lane_f32({in2}.v1, '
                          '{i}));\n'.format(i=i, **fmtspec) \
                          for i in range(4)]) + \
               '''
                  #endif'''
    if simd_ext == 'neon128' and typ == 'f64':
        return '''i64 offset_buf[2];
                  vst1q_s64(offset_buf, {in1});
                  {in0}[offset_buf[0]] = {in2}.v0;
                  {in0}[offset_buf[1]] = {in2}.v1;'''.format(**fmtspec)
    if simd_ext in neon or typ in ['i8', 'u8', 'i16', 'u16']:
        return emul
    # getting here means SVE
    return 'svst1_scatter_s{typnbits}index_{suf}({svtrue}, {in0}, ' \
           '{in1}, {in2});'.format(le=le, **fmtspec)

# -----------------------------------------------------------------------------
# linear scatter

def scatter_linear(simd_ext, typ):
    # C body storing lane k of {in2} to {in0}[k * {in1}] (constant stride).
    if simd_ext in sve:
        if typ in ['i8', 'u8', 'i16', 'u16', 'f16']:
            le = max_len(simd_ext, typ)
            real_le = real_len(simd_ext, typ)
            return '''{typ} buf[{le}];
                      int i;
                      svst1_{suf}({svtrue}, buf, {in2});
                      for (i = 0; i < {real_le}; i++) {{
                        {in0}[i * {in1}] = buf[i];
                      }}'''.format(le=le, real_le=real_le, **fmtspec)
        else:
            return 'svst1_scatter_s{typnbits}index_{suf}({svtrue}, {in0}, 
' \
                   'svindex_s{typnbits}(0, (i{typnbits}){in1}), {in2});'. \
                   format(**fmtspec)
    # getting here means neon128 and aarch64
    intrinsic = '\n'.join([
        '{in0}[{i} * {in1}] = vgetq_lane_{suf}({in2}, {i});'. \
        format(i=i, **fmtspec)
        for i in range(128 // int(fmtspec['typnbits']))])
    if typ == 'f16':
        return '''#ifdef NSIMD_ARM_FP16
                  {intrinsic}
                  #else
                  f32 buf[8];
                  int i;
                  vst1q_f32(buf, {in2}.v0);
                  vst1q_f32(buf + 4, {in2}.v1);
                  for (i = 0; i < 8; i++) {{
                    {in0}[i * {in1}] = nsimd_f32_to_f16(buf[i]);
                  }}
                  #endif'''.format(intrinsic=intrinsic, **fmtspec)
    if typ == 'f64' and simd_ext == 'neon128':
        return '''{in0}[0] = {in2}.v0;
                  {in0}[{in1}] = {in2}.v1;'''.format(**fmtspec)
    return intrinsic

# -----------------------------------------------------------------------------
# mask_scatter

def mask_scatter(simd_ext, typ):
    # C body storing lane k of {in3} to {in1}[{in2}[k]] when mask {in0} is
    # set for that lane.
    le = max_len(simd_ext, typ)
    real_le = real_len(simd_ext, typ)
    if simd_ext in sve:
        # Spill indices, mask (as 0/1 integers) and values, then store
        # scalar-wise below.
        store = '''svst1_s{typnbits}({svtrue}, offset_buf, {in2});
                   svst1_u{typnbits}({svtrue}, mask, svsel_u{typnbits}(
                       {in0}, svdup_n_u{typnbits}((u{typnbits})1),
                       svdup_n_u{typnbits}((u{typnbits})0)));
                   svst1_{suf}({svtrue}, buf, {in3});'''.format(**fmtspec)
    else:
        store = '''vst1q_s{typnbits}(offset_buf, {in2});
                   vst1q_{suf}(buf, {in3});
                   vst1q_u{typnbits}(mask, {in0});'''.format(**fmtspec)
    emul = '''int i;
              {typ} buf[{le}];
              u{typnbits} mask[{le}];
              i{typnbits} offset_buf[{le}];
              {store}
              for (i = 0; i < {real_le}; i++) {{
                if (mask[i]) {{
                  {in1}[offset_buf[i]] = buf[i];
                }}
              }}'''.format(le=le, real_le=real_le, store=store, **fmtspec)
    if typ == 'f16':
        if simd_ext in sve:
            return emul
        return '''#ifdef NSIMD_ARM_FP16
                  {emul}
                  #else
                  int i;
                  f32 buf[{le}];
                  u32 mask[{le}];
                  i16 offset_buf[{le}];
                  vst1q_s16(offset_buf, {in2});
                  vst1q_f32(buf, {in3}.v0);
                  vst1q_f32(buf + {leo2}, {in3}.v1);
                  vst1q_u32(mask, {in0}.v0);
                  vst1q_u32(mask + {leo2}, {in0}.v1);
                  for (i = 0; i < {le}; i++) {{
                    if (mask[i]) {{
                      {in1}[offset_buf[i]] = nsimd_f32_to_f16(buf[i]);
                    }}
                  }}
                  #endif'''.format(emul=emul, le=le, leo2=le // 2, **fmtspec)
    if simd_ext == 'neon128' and typ == 'f64':
        return '''i64 offset_buf[2];
                  vst1q_s64(offset_buf, {in2});
                  if ({in0}.v0) {{
                    {in1}[offset_buf[0]] = {in3}.v0;
                  }}
                  if ({in0}.v1) {{
                    {in1}[offset_buf[1]] = {in3}.v1;
                  }}'''.format(**fmtspec)
    if simd_ext in neon or typ in ['i8', 'u8', 'i16', 'u16']:
        return emul
    # getting here means SVE
    return 'svst1_scatter_s{typnbits}index_{suf}({in0}, {in1}, ' \
           '{in2}, {in3});'.format(le=le, **fmtspec)

# -----------------------------------------------------------------------------
# get_impl function

def get_impl(opts, func, simd_ext, from_typ, to_typ):
    # Entry point used by hatch.py: dispatch operator name -> generator.
    # fmtspec is a module-level dict refreshed on every call.
    global fmtspec

    # Fixed-size SVE extensions (sve128, ...) share the generic 'sve'
    # generators.
    simd_ext2 = simd_ext if not simd_ext in fixed_sized_sve else 'sve'

    fmtspec = {
        'simd_ext': simd_ext,
        'simd_ext2': simd_ext2,
        'typ': from_typ,
        'from_typ': from_typ,
        'to_typ': to_typ,
        'suf': suf(from_typ),
        'in0': common.in0,
        'in1': common.in1,
        'in2': common.in2,
        'in3': common.in3,
        'in4': common.in4,
        'in5': common.in5,
        'typnbits': from_typ[1:],
        'svtrue': 'svptrue_b{}()'.format(from_typ[1:]),
        'svetyp': sve_typ(from_typ),
    }

    impls = {
        'loada': lambda: load1234(opts, simd_ext, from_typ, 1),
        'masko_loada1': lambda: maskoz_load('o', simd_ext, from_typ),
        'maskz_loada1': lambda: maskoz_load('z', simd_ext, from_typ),
        'load2a': lambda: load1234(opts, simd_ext, from_typ, 2),
        'load3a': lambda: load1234(opts, simd_ext, from_typ, 3),
        'load4a': lambda: load1234(opts, simd_ext, from_typ, 4),
        'loadu': lambda: load1234(opts, simd_ext, from_typ, 1),
        'masko_loadu1': lambda: maskoz_load('o', simd_ext, from_typ),
        'maskz_loadu1': lambda: maskoz_load('z', simd_ext, from_typ),
        'load2u': lambda: load1234(opts, simd_ext, from_typ, 2),
        'load3u': lambda: load1234(opts, simd_ext, from_typ, 3),
        'load4u': lambda: load1234(opts, simd_ext, from_typ, 4),
        'storea': lambda: store1234(opts, simd_ext, from_typ, 1),
        'mask_storea1': lambda: mask_store(simd_ext, from_typ),
        'store2a': lambda: store1234(opts, simd_ext, from_typ, 2),
        'store3a': lambda: store1234(opts, simd_ext, from_typ, 3),
        'store4a': lambda: store1234(opts, simd_ext, from_typ, 4),
        'storeu': lambda: store1234(opts, simd_ext, from_typ, 1),
        'mask_storeu1': lambda: mask_store(simd_ext, from_typ),
        'store2u': lambda: store1234(opts, simd_ext, from_typ, 2),
        'store3u': lambda: store1234(opts, simd_ext, from_typ, 3),
        'store4u': lambda: store1234(opts, simd_ext, from_typ, 4),
        'gather': lambda: gather(simd_ext, from_typ),
        'gather_linear': lambda: gather_linear(simd_ext, from_typ),
        'maskz_gather': lambda: maskoz_gather('z', simd_ext, from_typ),
        'masko_gather': lambda: maskoz_gather('o', simd_ext, from_typ),
        'scatter': lambda: scatter(simd_ext, from_typ),
        'scatter_linear': lambda: scatter_linear(simd_ext, from_typ),
        'mask_scatter': lambda: mask_scatter(simd_ext, from_typ),
        'andb': lambda: binop2("andb", simd_ext2, from_typ),
        'xorb': lambda: binop2("xorb", simd_ext2, from_typ),
        'orb': lambda: binop2("orb", simd_ext2, from_typ),
        'andl': lambda: lop2(opts, "andl", simd_ext2, from_typ),
        'xorl': lambda: lop2(opts, "xorl", simd_ext2, from_typ),
        'orl': lambda: lop2(opts, "orl", simd_ext2, from_typ),
        'notb': lambda: not1(simd_ext2, from_typ),
        'notl': lambda: lnot1(opts, simd_ext2, from_typ),
        'andnotb': lambda: binop2("andnotb", simd_ext2, from_typ),
        'andnotl': lambda: lop2(opts, "andnotl", simd_ext2, from_typ),
        'add': lambda: addsub("add", simd_ext2, from_typ),
        'sub': lambda: addsub("sub", simd_ext2, from_typ),
        'adds': lambda: adds(simd_ext2, from_typ),
        'subs': lambda: subs(simd_ext2, from_typ),
        'div': lambda: div2(simd_ext2, from_typ),
        'sqrt': lambda: sqrt1(simd_ext2, from_typ),
        'len': lambda: len1(simd_ext, from_typ),
        'mul': lambda: mul2(simd_ext2, from_typ),
        'shl': lambda: shl_shr("shl", simd_ext2, from_typ),
        'shr': lambda: shl_shr("shr", simd_ext2, from_typ),
        'shra': lambda: shra(simd_ext2, from_typ),
        'set1': lambda: set1(simd_ext2, from_typ),
        'set1l': lambda: lset1(simd_ext2, from_typ),
        'eq': lambda: cmp2(opts, "eq", simd_ext2, from_typ),
        'lt': lambda: cmp2(opts, "lt", simd_ext2, from_typ),
        'le': lambda: cmp2(opts, "le", simd_ext2, from_typ),
        'gt': lambda: cmp2(opts, "gt", simd_ext2, from_typ),
        'ge': lambda: cmp2(opts, "ge", simd_ext2, from_typ),
        'ne': lambda: neq2(opts, simd_ext2, from_typ),
        'if_else1': lambda: if_else3(opts, simd_ext2, from_typ),
        'min': lambda: minmax2("min", simd_ext2, from_typ),
        'max': lambda: minmax2("max", simd_ext2, from_typ),
        'loadla': lambda: loadl(True, simd_ext2, from_typ),
        'loadlu': lambda: loadl(False, simd_ext2, from_typ),
        'storela': lambda: storel(True, simd_ext2, from_typ),
        'storelu': lambda: storel(False, simd_ext2, from_typ),
        'abs': lambda: abs1(simd_ext2, from_typ),
        'fma': lambda: fmafnma3("fma", simd_ext2, from_typ),
        'fnma': lambda: fmafnma3("fnma", simd_ext2, from_typ),
        'fms': lambda: fmsfnms3("fms", simd_ext2, from_typ),
        'fnms': lambda: fmsfnms3("fnms", simd_ext2, from_typ),
        'ceil': lambda: round1("ceil", simd_ext2, from_typ),
        'floor': lambda: round1("floor", simd_ext2, from_typ),
        'trunc': lambda: round1("trunc", simd_ext2, from_typ),
        'round_to_even': lambda: round1("round_to_even", simd_ext2,
                                        from_typ),
        'all': lambda: allany1(opts, "all", simd_ext2, from_typ),
        'any': lambda: allany1(opts, "any", simd_ext2, from_typ),
        'reinterpret': lambda: reinterpret1(simd_ext2, from_typ, to_typ),
        'reinterpretl': lambda: reinterpretl1(simd_ext2, from_typ, to_typ),
        'cvt': lambda: convert1(simd_ext2, from_typ, to_typ),
        'rec11': lambda: recs1("rec11", simd_ext2, from_typ),
        'rec8': lambda: recs1("rec8", simd_ext2, from_typ),
        'rsqrt11': lambda: recs1("rsqrt11", simd_ext2, from_typ),
        'rsqrt8': lambda: recs1("rsqrt8", simd_ext2, from_typ),
        'rec': lambda: recs1("rec", simd_ext2, from_typ),
        'neg': lambda: neg1(simd_ext2, from_typ),
        'nbtrue': lambda: nbtrue1(opts, simd_ext2, from_typ),
        'reverse': lambda: reverse1(simd_ext2, from_typ),
        'addv': lambda: addv(simd_ext2, from_typ),
        'upcvt': lambda: upcvt1(simd_ext2, from_typ, to_typ),
        'downcvt': lambda: downcvt1(simd_ext2, from_typ, to_typ),
        'to_logical': lambda: to_logical1(opts, simd_ext2, from_typ),
        'to_mask': lambda: to_mask1(opts, simd_ext2, from_typ),
        'ziplo': lambda: zip_unzip_half("zip1", simd_ext2, from_typ),
        'ziphi': lambda: zip_unzip_half("zip2", simd_ext2, from_typ),
        'unziplo': lambda: zip_unzip_half("uzp1", simd_ext2, from_typ),
        'unziphi': lambda: zip_unzip_half("uzp2", simd_ext2, from_typ),
        'zip' : lambda: zip_unzip("zip", simd_ext2, from_typ),
        'unzip' : lambda: zip_unzip("uzp", simd_ext2, from_typ),
        'mask_for_loop_tail': lambda : mask_for_loop_tail(simd_ext,
                                                          from_typ),
        'iota': lambda : iota(simd_ext2, from_typ)
    }
    if simd_ext not in get_simd_exts():
        raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext))
    if not from_typ in common.types:
        raise ValueError('Unknown type "{}"'.format(from_typ))
    if not func in impls:
        return common.NOT_IMPLEMENTED
    else:
        return impls[func]()



================================================
FILE: egg/platform_cpu.py
================================================
# Copyright (c) 2020 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# This file gives the implementation of platform CPU, i.e. scalar emulation. # Reading this file is straightforward. For each function, e.g. the addition, # code looks like: # # return 'return {} + {};'.format(common.in0, common.in1) # # with an 'if' before to handle the FP16 special case. import common import scalar # ----------------------------------------------------------------------------- # Emulation parameters # # When emulating, we need to choose a vector length to fit the philosophy of # SIMD. By default we choose 64 bits. It must be a multiple of 64 bits. NBITS = common.CPU_NBITS def get_nb_el(typ): return NBITS // int(typ[1:]) # ----------------------------------------------------------------------------- # Implementation of mandatory functions for this module def get_simd_exts(): return ['cpu'] def get_prev_simd_ext(simd_ext): if simd_ext != 'cpu': raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) return '' def get_simd_strings(simd_ext): if simd_ext == 'cpu': return ['cpu'] else: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) def emulate_fp16(simd_ext): if simd_ext != 'cpu': raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) return True def get_type(opts, simd_ext, typ, nsimd_typ): if simd_ext != 'cpu': raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) if typ not in common.types: raise ValueError('Unknown type "{}"'.format(typ)) typ2 = typ if typ != 'f16' else 'f32' members = '\n'.join('{} v{};'.format(typ2, i) \ for i in range(0, get_nb_el(typ))) return 'typedef struct {{ {} }} {};'.format(members, nsimd_typ) def get_logical_type(opts, simd_ext, typ, nsimd_typ): if simd_ext != 'cpu': raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) if typ not in common.types: raise ValueError('Unknown type "{}"'.format(typ)) members = '\n'.join('unsigned int v{};'.format(i) \ for i in range(0, get_nb_el(typ))) return 'typedef struct {{ {} }} {};'.format(members, nsimd_typ) def 
get_nb_registers(simd_ext): if simd_ext != 'cpu': raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) return '1' def has_compatible_SoA_types(simd_ext): if simd_ext != 'cpu': raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) return False def get_additional_include(func, platform, simd_ext): if func in ['adds', 'subs', 'orb', 'andb', 'andnotb', 'xorb', 'min', 'max' 'notb', 'sqrt', 'shr', 'shl', 'shra', 'abs', 'fma', 'fnma', 'fms', 'fnms', 'ceil', 'floor', 'trunc', 'round_to_even', 'rec11', 'rec8', 'rsqrt11', 'rsqrt8', 'rec', 'neg', 'lgamma_u10', 'tgamma_u10', 'erf_u10', 'erfc_u15']: return '''#include ''' elif func == 'zip': return '''#include #include ''' elif func == 'unzip': return '''#include #include ''' return '' # ----------------------------------------------------------------------------- # Returns C code for func fmtspec = {} def repeat_stmt(fmt, typ): return '\n'.join(fmt.format(i=i) for i in range(0, get_nb_el(typ))) # ----------------------------------------------------------------------------- def func_body(fmt, typ2, logical = False): return '''nsimd_cpu_v{logical}{typ2} ret; {content} return ret;'''.format(logical='l' if logical else '', typ2=typ2, content=repeat_stmt(fmt, typ2), **fmtspec) # ----------------------------------------------------------------------------- def op2(op, typ): return func_body('ret.v{{i}} = {cast}({in0}.v{{i}} {op} {in1}.v{{i}});'. \ format(cast='({})'.format(typ) if typ in common.iutypes \ else '', op=op, **fmtspec), typ) # ----------------------------------------------------------------------------- def lop2(op, typ): return func_body('ret.v{{i}} = {in0}.v{{i}} {op} {in1}.v{{i}};'. 
\ format(op=op, **fmtspec), typ, True) # ----------------------------------------------------------------------------- def landnot2(typ): return func_body('ret.v{{i}} = {in0}.v{{i}} & (~{in1}.v{{i}});'.\ format(**fmtspec), typ, True) # ----------------------------------------------------------------------------- def lnot1(typ): return func_body('ret.v{{i}} = ~{in0}.v{{i}};'.\ format(**fmtspec), typ, True) # ----------------------------------------------------------------------------- def scalar_impl(func, typ, arity): typ2 = 'f32' if typ == 'f16' else typ # special case for shl, shr, shra if func in ['shl', 'shr', 'shra']: args = '{in0}.v{{i}}, {in1}'.format(**fmtspec) else: args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \ + '.v{i}' for i in range(arity)]) return func_body('ret.v{{i}} = nsimd_scalar_{func}_{typ2}({args});'. \ format(func=func, typ2=typ2, args=args, **fmtspec), typ) # ----------------------------------------------------------------------------- def cmp2(op, typ): return '''nsimd_cpu_vl{typ} ret; {content} return ret;'''.format(content=repeat_stmt( '''ret.v{{i}} = (u32)({in0}.v{{i}} {op} {in1}.v{{i}} ? -1 : 0);'''. \ format(op=op, **fmtspec), typ), **fmtspec) # ----------------------------------------------------------------------------- def set1(typ): if typ == 'f16': content = repeat_stmt('ret.v{{i}} = nsimd_f16_to_f32({in0});'. \ format(**fmtspec), typ) else: content = repeat_stmt('ret.v{{i}} = {in0};'.format(**fmtspec), typ) return '''nsimd_cpu_v{typ} ret; {content} return ret;'''.format(content=content, **fmtspec) # ----------------------------------------------------------------------------- def set1l(typ): return func_body('ret.v{{i}} = (u32)({in0} ? -1 : 0);'. \ format(**fmtspec), typ, True) # ----------------------------------------------------------------------------- def load(typ): if typ == 'f16': content = repeat_stmt( 'ret.v{{i}} = nsimd_u16_to_f32(((u16 *){in0})[{{i}}]);'. 
            format(**fmtspec), typ)
    else:
        content = repeat_stmt('ret.v{{i}} = {in0}[{{i}}];'.format(**fmtspec),
                              typ)
    return '''nsimd_cpu_v{typ} ret;
              {content}
              return ret;'''.format(content=content, **fmtspec)

# -----------------------------------------------------------------------------

def maskoz_load(oz, typ):
    # Masked load: oz == 'z' zeroes inactive lanes, oz == 'o' takes them
    # from {in2}.
    if typ == 'f16':
        else_value = '0.0f' if oz == 'z' else '{in2}.v{{i}}'.format(**fmtspec)
        content = repeat_stmt(
            '''ret.v{{i}} = {in0}.v{{i}}
                                ? nsimd_u16_to_f32(((u16 *){in1})[{{i}}])
                                : {else_value};'''. \
            format(else_value=else_value, **fmtspec), typ)
    else:
        else_value = '({typ})0'.format(**fmtspec) if oz == 'z' else \
                     '{in2}.v{{i}}'.format(**fmtspec)
        content = repeat_stmt(
            'ret.v{{i}} = {in0}.v{{i}} ? {in1}[{{i}}] : {else_value};'. \
            format(else_value=else_value, **fmtspec), typ)
    return '''nsimd_cpu_v{typ} ret;
              {content}
              return ret;'''.format(content=content, **fmtspec)

# -----------------------------------------------------------------------------

def load_deg234(typ, deg):
    # Deinterleaved load of degree deg (load2/load3/load4): lane i of member
    # j comes from memory slot deg * i + j.
    if typ == 'f16':
        buf = repeat_stmt(
            '''ret.v{{{{j}}}}.v{{i}} = nsimd_u16_to_f32(
                   ((u16 *){in0})[{deg} * {{i}} + {{{{j}}}}]);'''. \
            format(deg=deg, **fmtspec), typ)
    else:
        buf = repeat_stmt(
            'ret.v{{{{j}}}}.v{{i}} = {in0}[{deg} * {{i}} + {{{{j}}}}];'. \
            format(deg=deg, **fmtspec), typ)
    content = '\n'.join(buf.format(j=j) for j in range(0, deg))
    return '''nsimd_cpu_v{typ}x{deg} ret;
              {content}
              return ret;'''.format(deg=deg, content=content, **fmtspec)

# -----------------------------------------------------------------------------

def store_deg234(typ, deg):
    # Interleaved store of degree deg: member j of argument in{j+1} goes to
    # memory slot deg * i + j.
    content = ''
    for i in range(0, get_nb_el(typ)):
        for j in range(1, deg + 1):
            arg = fmtspec['in{}'.format(j)]
            if typ == 'f16':
                content += \
                '''((u16 *){in0})[{deg} * {i} + {j}] =
                       nsimd_f32_to_u16({arg}.v{i});\n'''. \
                format(deg=deg, i=i, j=j - 1, arg=arg, **fmtspec)
            else:
                content += \
                '{in0}[{deg} * {i} + {j}] = {arg}.v{i};\n'. \
                format(deg=deg, i=i, j=j - 1, arg=arg, **fmtspec)
    return content[:-1]

# -----------------------------------------------------------------------------

def loadl(typ):
    # Load a logical: non-zero memory values become all-ones lanes.
    if typ == 'f16':
        content = repeat_stmt(
            '''ret.v{{i}} = (u32)(nsimd_u16_to_f32(((u16 *){in0})[{{i}}])
                                      == 0.0f ? 0 : -1);'''. \
            format(**fmtspec), typ)
    else:
        content = repeat_stmt(
            '''ret.v{{i}} = (u32)({in0}[{{i}}] == ({typ})0 ? 0 : -1);'''. \
            format(**fmtspec), typ)
    return '''nsimd_cpu_vl{typ} ret;
              {content}
              return ret;'''.format(content=content, **fmtspec)

# -----------------------------------------------------------------------------

def store(typ):
    if typ == 'f16':
        return repeat_stmt(
            '((u16*){in0})[{{i}}] = nsimd_f32_to_u16({in1}.v{{i}});'. \
            format(**fmtspec), typ)
    else:
        return repeat_stmt('{in0}[{{i}}] = {in1}.v{{i}};'. \
                           format(**fmtspec), typ)

# -----------------------------------------------------------------------------

def mask_store(typ):
    # Store only the lanes whose mask ({in0}) is set.
    if typ == 'f16':
        return repeat_stmt(
            '''if ({in0}.v{{i}}) {{{{
                 ((u16*){in1})[{{i}}] = nsimd_f32_to_u16({in2}.v{{i}});
               }}}}'''.format(**fmtspec), typ)
    else:
        return repeat_stmt('''if ({in0}.v{{i}}) {{{{
                                {in1}[{{i}}] = {in2}.v{{i}};
                              }}}}'''.format(**fmtspec), typ)

# -----------------------------------------------------------------------------

def storel(typ):
    # Store a logical as 0/1 values of type typ.
    if typ == 'f16':
        content = repeat_stmt(
            '''((u16*){in0})[{{i}}] = (u16)({in1}.v{{i}} == (u32)0
                                  ? nsimd_f32_to_u16(0.0f)
                                  : nsimd_f32_to_u16(1.0f));'''. \
            format(**fmtspec), typ)
    else:
        content = repeat_stmt(
            '''{in0}[{{i}}] = ({typ})({in1}.v{{i}} == (u32)0
                                          ? ({typ})0 : ({typ})1);'''. \
            format(**fmtspec), typ)
    return content

# -----------------------------------------------------------------------------

def if_else1(typ):
    # Lane-wise select: mask ? {in1} : {in2}.
    typ2 = 'f32' if typ == 'f16' else typ
    return func_body(
        '''ret.v{{i}} = ({typ2})({in0}.v{{i}} != (u32)0
                                     ? {in1}.v{{i}} : {in2}.v{{i}});'''. \
        format(typ2=typ2, **fmtspec), typ)

# -----------------------------------------------------------------------------

def all_any(typ, func):
    # Reduce a logical to a C int: all/any lanes equal to (u32)-1.
    op = '&&' if func == 'all' else '||'
    if get_nb_el(typ) == 1:
        cond = '{in0}.v0 == (u32)-1'.format(**fmtspec)
    else:
        cond = op.join('({in0}.v{i} == (u32)-1)'.format(i=i, **fmtspec) \
                       for i in range(0, get_nb_el(typ)))
    return '''if ({cond}) {{
                return -1;
              }} else {{
                return 0;
              }}'''.format(cond=cond)

# -----------------------------------------------------------------------------

def reinterpret1(from_typ, to_typ):
    # Bit-preserving conversion: round-trip through a byte buffer.
    if from_typ == to_typ:
        return func_body('ret.v{{i}} = {in0}.v{{i}};'.format(**fmtspec),
                         to_typ)
    return '''char buf[{len}];
              nsimd_storeu_cpu_{from_typ}(({from_typ} *)buf, {in0});
              return nsimd_loadu_cpu_{to_typ}(({to_typ} *)buf);'''. \
           format(len=NBITS // 8, **fmtspec)

# -----------------------------------------------------------------------------

def reinterpretl1(from_typ, to_typ):
    # Logicals all have the same u32-per-lane layout: plain copy.
    return func_body('ret.v{{i}} = {in0}.v{{i}};'.format(**fmtspec),
                     to_typ, True);

# -----------------------------------------------------------------------------

def convert1(from_typ, to_typ):
    # Value-preserving conversion via a C cast.
    if to_typ == from_typ:
        return func_body('ret.v{{i}} = {in0}.v{{i}};'.format(**fmtspec),
                         to_typ)
    typ2 = 'f32' if to_typ == 'f16' else to_typ
    return func_body('ret.v{{i}} = ({typ2}){in0}.v{{i}};'. \
                     format(typ2=typ2, **fmtspec), to_typ)

# -----------------------------------------------------------------------------

def nbtrue1(typ):
    # Count lanes whose logical value is all-ones.
    acc_code = repeat_stmt('acc += {in0}.v{{i}} == (u32)-1 ? 1 : 0;'. \
                           format(**fmtspec), typ)
    return '''int acc = 0;
              {acc_code}
              return acc;'''.format(acc_code=acc_code)

# -----------------------------------------------------------------------------

def reverse1(typ):
    n = get_nb_el(typ)
    content = '\n'.join('ret.v{i} = {in0}.v{j}'.
\ format(i=i, j=n - i, **fmtspec) \ for i in range(0, n)) return '''nsimd_cpu_v{typ} ret; {content} return ret;'''.format(content=content, **fmtspec) # ----------------------------------------------------------------------------- def addv1(typ): content = '+'.join('{in0}.v{i}'.format(i=i, **fmtspec) \ for i in range(0, get_nb_el(typ))) if typ == 'f16': return 'return nsimd_f32_to_f16({});'.format(content) else: return 'return {};'.format(content) # ----------------------------------------------------------------------------- def upcvt1(from_typ, to_typ): n = get_nb_el(to_typ) to_typ2 = 'f32' if to_typ == 'f16' else to_typ lower_half = '\n'.join('ret.v0.v{i} = ({to_typ2}){in0}.v{i};'. \ format(i=i, to_typ2=to_typ2, **fmtspec) \ for i in range(0, n)) upper_half = '\n'.join('ret.v1.v{i} = ({to_typ2}){in0}.v{j};'. \ format(i=i, j=i + n, to_typ2=to_typ2, **fmtspec) \ for i in range(0, n)) return '''nsimd_cpu_v{to_typ}x2 ret; {lower_half} {upper_half} return ret;'''.format(lower_half=lower_half, upper_half=upper_half, **fmtspec) # ----------------------------------------------------------------------------- def downcvt2(from_typ, to_typ): n = get_nb_el(from_typ) to_typ2 = 'f32' if to_typ == 'f16' else to_typ lower_half = '\n'.join('ret.v{i} = ({to_typ2}){in0}.v{i};'. \ format(i=i, to_typ2=to_typ2, **fmtspec) \ for i in range(0, n)) upper_half = '\n'.join('ret.v{j} = ({to_typ2}){in1}.v{i};'. \ format(i=i, j=i + n, to_typ2=to_typ2, **fmtspec) \ for i in range(0, n)) return '''nsimd_cpu_v{to_typ} ret; {lower_half} {upper_half} return ret;'''.format(lower_half=lower_half, upper_half=upper_half, **fmtspec) # ----------------------------------------------------------------------------- def len1(typ): return 'return {};'.format(get_nb_el(typ)) # ----------------------------------------------------------------------------- def to_logical1(typ): unsigned_to_logical = \ 'ret.v{{i}} = ({in0}.v{{i}} == ({utyp})0 ? (u32)0 : (u32)-1);'. 
\ format(**fmtspec) if typ in common.utypes: return func_body(unsigned_to_logical, typ, True) else: unsigned_to_logical = \ 'ret.v{{i}} = (buf.v{{i}} == ({utyp})0 ? (u32)0 : (u32)-1);'. \ format(**fmtspec) return '''nsimd_cpu_vl{typ} ret; nsimd_cpu_vu{typnbits} buf; buf = nsimd_reinterpret_cpu_u{typnbits}_{typ}({in0}); {unsigned_to_logical} return ret;'''. \ format(unsigned_to_logical=repeat_stmt(unsigned_to_logical, typ), **fmtspec) # ----------------------------------------------------------------------------- def to_mask1(typ): logical_to_unsigned = \ 'ret.v{{i}} = ({utyp})({in0}.v{{i}} ? -1 : 0);'. \ format(**fmtspec) if typ in common.utypes: return func_body(logical_to_unsigned, typ) elif typ == 'f16': return '''union {{ f32 f; u32 u; }} buf; nsimd_cpu_vf16 ret; {u32_to_f32} return ret;'''. \ format(u32_to_f32=repeat_stmt( 'buf.u = {in0}.v{{i}}; ret.v{{i}} = buf.f;'. \ format(**fmtspec), 'f16'), **fmtspec) else: return '''nsimd_cpu_vu{typnbits} ret; {logical_to_unsigned} return nsimd_reinterpret_cpu_{typ}_u{typnbits}(ret);'''. \ format(logical_to_unsigned=repeat_stmt(logical_to_unsigned, typ), **fmtspec) # ----------------------------------------------------------------------------- def zip_half(func, typ): n = get_nb_el(typ) if func == "ziplo": content = '\n'.join('ret.v{j1} = {in0}.v{i}; ret.v{j2} = {in1}.v{i};'. \ format(i=i, j1=i*2, j2=i*2+1, **fmtspec) \ for i in range(0, int(n/2))) else : content = '\n'.join('ret.v{j1} = {in0}.v{i}; ret.v{j2} = {in1}.v{i};'. \ format(i=i+int(n/2), j1=i*2, j2=i*2+1, **fmtspec) \ for i in range(0, int(n/2))) return '''nsimd_cpu_v{typ} ret; {content} return ret;'''.format(content=content, **fmtspec) # ----------------------------------------------------------------------------- def unzip_half(func, typ): n = get_nb_el(typ) content = '' if func == "unziplo": content = '\n'.join('ret.v{i} = {in0}.v{j}; '. \ format(i=i, j=i*2, **fmtspec) \ for i in range(0, int(n/2))) content = content + '\n'.join('ret.v{i} = {in1}.v{j}; '. 
\ format(i=i, j=2*(i-int(n/2)), **fmtspec) \ for i in range(int(n/2), n)) else : content = '\n'.join('ret.v{i} = {in0}.v{j}; '. \ format(i=i, j=i*2+1, **fmtspec) \ for i in range(0, int(n/2))) content = content + '\n'.join('ret.v{i} = {in1}.v{j}; '. \ format(i=i, j=2*(i-int(n/2))+1, **fmtspec)\ for i in range(int(n/2), n)) return '''nsimd_cpu_v{typ} ret; {content} return ret;'''.format(content=content, **fmtspec) def zip(from_typ): return '''nsimd_{simd_ext}_v{typ}x2 ret; ret.v0 = nsimd_ziplo_cpu_{typ}({in0}, {in1}); ret.v1 = nsimd_ziphi_cpu_{typ}({in0}, {in1}); return ret;'''.format(**fmtspec) def unzip(from_typ): return '''nsimd_{simd_ext}_v{typ}x2 ret; ret.v0 = nsimd_unziplo_cpu_{typ}({in0}, {in1}); ret.v1 = nsimd_unziphi_cpu_{typ}({in0}, {in1}); return ret;'''.format(**fmtspec) # ----------------------------------------------------------------------------- def mask_for_loop_tail(typ): return func_body( 'ret.v{{i}} = {in0} + {{i}} < {in1} ? (u32)-1 : (u32)0;'. \ format(**fmtspec), typ, True) # ----------------------------------------------------------------------------- def iota(typ): typ2 = 'f32' if typ == 'f16' else typ return func_body('ret.v{{i}} = ({typ2}){{i}};'. \ format(typ2=typ2, **fmtspec), typ) # ----------------------------------------------------------------------------- def gather(typ): if typ == 'f16': return func_body( 'ret.v{{i}} = nsimd_f16_to_f32({in0}[{in1}.v{{i}}]);'. \ format(**fmtspec), typ) return func_body('ret.v{{i}} = {in0}[{in1}.v{{i}}];'. \ format(**fmtspec), typ) # ----------------------------------------------------------------------------- def gather_linear(typ): if typ == 'f16': return func_body( 'ret.v{{i}} = nsimd_f16_to_f32({in0}[{{i}} * {in1}]);'. \ format(**fmtspec), typ) return func_body('ret.v{{i}} = {in0}[{{i}} * {in1}];'. 
\ format(**fmtspec), typ) # ----------------------------------------------------------------------------- def maskoz_gather(op, typ): if typ == 'f16': oz = '0.0f' if op == 'z' else '{in3}.v{{i}}' return func_body( ('''if ({in0}.v{{i}}) {{{{ ret.v{{i}} = nsimd_f16_to_f32({in1}[{in2}.v{{i}}]); }}}} else {{{{ ret.v{{i}} = ''' + oz + '''; }}}}''').format(**fmtspec), typ) oz = '({typ})0' if op == 'z' else '{in3}.v{{i}}' return func_body(('''if ({in0}.v{{i}}) {{{{ ret.v{{i}} = {in1}[{in2}.v{{i}}]; }}}} else {{{{ ret.v{{i}} = ''' + oz + '''; }}}}''').format(**fmtspec), typ) # ----------------------------------------------------------------------------- def scatter(typ): if typ == 'f16': return repeat_stmt( '{in0}[{in1}.v{{i}}] = nsimd_f32_to_f16({in2}.v{{i}});'. \ format(**fmtspec), typ) return repeat_stmt('{in0}[{in1}.v{{i}}] = {in2}.v{{i}};'. \ format(**fmtspec), typ) # ----------------------------------------------------------------------------- def scatter_linear(typ): if typ == 'f16': return repeat_stmt( '{in0}[{{i}} * {in1}] = nsimd_f32_to_f16({in2}.v{{i}});'. \ format(**fmtspec), typ) return repeat_stmt('{in0}[{{i}} * {in1}] = {in2}.v{{i}};'. 
\ format(**fmtspec), typ) # ----------------------------------------------------------------------------- def mask_scatter(typ): if typ == 'f16': return repeat_stmt( '''if ({in0}.v{{i}}) {{{{ {in1}[{in2}.v{{i}}] = nsimd_f32_to_f16({in3}.v{{i}}); }}}}'''.format(**fmtspec), typ) return repeat_stmt('''if ({in0}.v{{i}}) {{{{ {in1}[{in2}.v{{i}}] = {in3}.v{{i}}; }}}}'''.format(**fmtspec), typ) # ----------------------------------------------------------------------------- def get_impl(opts, func, simd_ext, from_typ, to_typ=''): global fmtspec fmtspec = { 'simd_ext': simd_ext, 'typ': from_typ, 'from_typ': from_typ, 'to_typ': to_typ, 'utyp': common.bitfield_type[from_typ], 'in0': common.in0, 'in1': common.in1, 'in2': common.in2, 'in3': common.in3, 'in4': common.in4, 'typnbits': from_typ[1:] } impls = { 'loada': lambda: load(from_typ), 'maskz_loada1': lambda: maskoz_load('z', from_typ), 'masko_loada1': lambda: maskoz_load('o', from_typ), 'load2a': lambda: load_deg234(from_typ, 2), 'load3a': lambda: load_deg234(from_typ, 3), 'load4a': lambda: load_deg234(from_typ, 4), 'loadu': lambda: load(from_typ), 'maskz_loadu1': lambda: maskoz_load('z', from_typ), 'masko_loadu1': lambda: maskoz_load('o', from_typ), 'load2u': lambda: load_deg234(from_typ, 2), 'load3u': lambda: load_deg234(from_typ, 3), 'load4u': lambda: load_deg234(from_typ, 4), 'storea': lambda: store(from_typ), 'mask_storea1': lambda: mask_store(from_typ), 'store2a': lambda: store_deg234(from_typ, 2), 'store3a': lambda: store_deg234(from_typ, 3), 'store4a': lambda: store_deg234(from_typ, 4), 'storeu': lambda: store(from_typ), 'mask_storeu1': lambda: mask_store(from_typ), 'store2u': lambda: store_deg234(from_typ, 2), 'store3u': lambda: store_deg234(from_typ, 3), 'store4u': lambda: store_deg234(from_typ, 4), 'loadla': lambda: loadl(from_typ), 'loadlu': lambda: loadl(from_typ), 'gather': lambda: gather(from_typ), 'gather_linear': lambda: gather_linear(from_typ), 'maskz_gather': lambda: maskoz_gather('z', from_typ), 
'masko_gather': lambda: maskoz_gather('o', from_typ), 'scatter': lambda: scatter(from_typ), 'scatter_linear': lambda: scatter_linear(from_typ), 'mask_scatter': lambda: mask_scatter(from_typ), 'storela': lambda: storel(from_typ), 'storelu': lambda: storel(from_typ), 'add': lambda: op2('+', from_typ), 'mul': lambda: op2('*', from_typ), 'div': lambda: op2('/', from_typ), 'sub': lambda: op2('-', from_typ), 'adds' : lambda: scalar_impl('adds', from_typ, 2), 'subs' : lambda: scalar_impl('subs', from_typ, 2), 'orb': lambda: scalar_impl('orb', from_typ, 2), 'orl': lambda: lop2('|', from_typ), 'andb': lambda: scalar_impl('andb', from_typ, 2), 'andnotb': lambda: scalar_impl('andnotb', from_typ, 2), 'andnotl': lambda: landnot2(from_typ), 'andl': lambda: lop2('&', from_typ), 'xorb': lambda: scalar_impl('xorb', from_typ, 2), 'xorl': lambda: lop2('^', from_typ), 'min': lambda: scalar_impl('min', from_typ, 2), 'max': lambda: scalar_impl('max', from_typ, 2), 'notb': lambda: scalar_impl('notb', from_typ, 1), 'notl': lambda: lnot1(from_typ), 'sqrt': lambda: scalar_impl('sqrt', from_typ, 1), 'set1': lambda: set1(from_typ), 'set1l': lambda: set1l(from_typ), 'shr': lambda: scalar_impl('shr', from_typ, 2), 'shl': lambda: scalar_impl('shl', from_typ, 2), 'shra': lambda: scalar_impl('shra', from_typ, 2), 'eq': lambda: cmp2('==', from_typ), 'ne': lambda: cmp2('!=', from_typ), 'gt': lambda: cmp2('>', from_typ), 'ge': lambda: cmp2('>=', from_typ), 'lt': lambda: cmp2('<', from_typ), 'le': lambda: cmp2('<=', from_typ), 'len': lambda: len1(from_typ), 'if_else1': lambda: if_else1(from_typ), 'abs': lambda: scalar_impl('abs', from_typ, 1), 'fma': lambda: scalar_impl('fma', from_typ, 3), 'fnma': lambda: scalar_impl('fnma', from_typ, 3), 'fms': lambda: scalar_impl('fms', from_typ, 3), 'fnms': lambda: scalar_impl('fnms', from_typ, 3), 'ceil': lambda: scalar_impl('ceil', from_typ, 1), 'floor': lambda: scalar_impl('floor', from_typ, 1), 'trunc': lambda: scalar_impl('trunc', from_typ, 1), 
'round_to_even': lambda: scalar_impl('round_to_even', from_typ, 1), 'all': lambda: all_any(from_typ, 'all'), 'any': lambda: all_any(from_typ, 'any'), 'reinterpret': lambda: reinterpret1(from_typ, to_typ), 'reinterpretl': lambda: reinterpretl1(from_typ, to_typ), 'cvt': lambda: convert1(from_typ, to_typ), 'rec11': lambda: scalar_impl('rec11', from_typ, 1), 'rec8': lambda: scalar_impl('rec8', from_typ, 1), 'rsqrt11': lambda: scalar_impl('rsqrt11', from_typ, 1), 'rsqrt8': lambda: scalar_impl('rsqrt8', from_typ, 1), 'rec': lambda: scalar_impl('rec', from_typ, 1), 'neg': lambda: scalar_impl('neg', from_typ, 1), 'nbtrue': lambda: nbtrue1(from_typ), 'reverse': lambda: reverse1(from_typ), 'addv': lambda: addv1(from_typ), 'upcvt': lambda: upcvt1(from_typ, to_typ), 'downcvt': lambda: downcvt2(from_typ, to_typ), 'to_logical': lambda: to_logical1(from_typ), 'to_mask': lambda: to_mask1(from_typ), 'ziplo': lambda: zip_half('ziplo', from_typ), 'ziphi': lambda: zip_half('ziphi', from_typ), 'unziplo': lambda: unzip_half('unziplo', from_typ), 'unziphi': lambda: unzip_half('unziphi', from_typ), 'zip' : lambda : zip(from_typ), 'unzip' : lambda : unzip(from_typ), 'mask_for_loop_tail': lambda : mask_for_loop_tail(from_typ), 'iota': lambda : iota(from_typ) } if simd_ext != 'cpu': raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) if not from_typ in common.types: raise ValueError('Unknown from_type "{}"'.format(from_typ)) if not func in impls: return common.NOT_IMPLEMENTED return impls[func]() ================================================ FILE: egg/platform_ppc.py ================================================ # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the 
Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # This file gives the implementation for the Power PC platform. # This script tries to be as readable as possible. It implements VMX and VSX. # Documentation found from: # https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf # https://www.ibm.com/docs/en/xl-c-and-cpp-linux/13.1.6?topic=functions-vector-built-in # https://gcc.gnu.org/onlinedocs/gcc-9.1.0/gcc/PowerPC-AltiVec-Built-in-Functions-Available-on-ISA-2_002e06.html import common fmtspec = {} # ----------------------------------------------------------------------------- # Helpers def has_to_be_emulated(simd_ext, typ): if typ == 'f16': return True if simd_ext == 'vmx' and typ in ['f64', 'i64', 'u64']: return True return False # Returns the power pc type corresponding to the nsimd type def native_type(typ): if typ == 'u8': return '__vector unsigned char' elif typ == 'i8': return '__vector signed char' elif typ == 'u16': return '__vector unsigned short' elif typ == 'i16': return '__vector signed short' elif typ == 'u32': return '__vector unsigned int' elif typ == 'u64': return '__vector unsigned long long' elif typ == 'i32': return '__vector signed int' elif typ == 'i64': return '__vector signed long long' elif typ == 'f32': return '__vector float' elif typ == 'f64': return '__vector double' 
else: raise ValueError('Type "{}" not supported'.format(typ)) # Returns the logical power pc type corresponding to the nsimd type def native_typel(typ): if typ in ['i8', 'u8']: return '__vector __bool char' elif typ in ['i16', 'u16']: return '__vector __bool short' elif typ in ['i32', 'u32', 'f32']: return '__vector __bool int' elif typ in ['f64', 'i64', 'u64']: return '__vector __bool long long' else: raise ValueError('Type "{}" not supported'.format(typ)) # Length of a vector with elements of type typ def get_len(typ): return 128 // int(typ[1:]) # Emulate 64 bits types for vmx only def emulate_64(op, typ, params): def arg(param, i): if param == 'v': return '{}.v{{i}}'.format(common.get_arg(i)) elif param == 'l': return '(int)({}.v{{i}} & ((u64)1))'.format(common.get_arg(i)) else: return common.get_arg(i) args = ', '.join(arg(params[i + 1], i) for i in range(len(params[1:]))) args0 = args.format(i=0) args1 = args.format(i=1) if params[0] == 'v': return '''nsimd_vmx_v{typ} ret; ret.v0 = nsimd_scalar_{op}_{typ}({args0}); ret.v1 = nsimd_scalar_{op}_{typ}({args1}); return ret;'''. \ format(typ=typ, op=op, args0=args0, args1=args1) else: return \ '''nsimd_vmx_vl{typ} ret; ret.v0 = (u64)(nsimd_scalar_{op}{suf}({args0}) ? -1 : 0); ret.v1 = (u64)(nsimd_scalar_{op}{suf}({args1}) ? -1 : 0); return ret;'''. \ format(suf='' if params == ['l'] * len(params) else '_' + typ, typ=typ, op=op, args0=args0, args1=args1) def emulate_f16(op, simd_ext, params): tmpl = ', '.join(['{{in{}}}.v{{{{i}}}}'.format(i).format(**fmtspec) \ for i in range(len(params[1:]))]) args1 = tmpl.format(i=0) args2 = tmpl.format(i=1) l = 'l' if params[0] == 'l' else '' return '''nsimd_{simd_ext}_v{l}f16 ret; ret.v0 = nsimd_{op}_{simd_ext}_f32({args1}); ret.v1 = nsimd_{op}_{simd_ext}_f32({args2}); return ret;'''. 
\ format(l=l, op=op, args1=args1, args2=args2, **fmtspec) def emulation_code(op, simd_ext, typ, params): if typ == 'f16': return emulate_f16(op, simd_ext, params) elif simd_ext == 'vmx' and typ in ['f64', 'i64', 'u64']: return emulate_64(op, typ, params) else: raise ValueError('Automatic emulation for {}/{}/{} is not supported'. \ format(func, simd_ext, typ)) def emulate_with_scalar(op, simd_ext, typ, params): def arg(param, i): if param == 'v': return 'vec_extract({}, {{i}})'.format(common.get_arg(i)) elif param == 'l': return '(int)(vec_extract({}, {{i}}) & ((u{})1))'. \ format(common.get_arg(i), typ[1:]) else: return common.get_arg(i) args = ', '.join(arg(params[i + 1], i) for i in range(len(params[1:]))) if params[0] == 'v': return '''nsimd_{simd_ext}_v{typ} ret; ret = vec_splats(nsimd_scalar_{op}_{typ}({args0})); '''.format(typ=typ, op=op, args0=args.format(i=0), simd_ext=simd_ext) + '\n' + \ '\n'.join('ret = vec_insert('\ 'nsimd_scalar_{op}_{typ}({argsi}), ret, {i});'. \ format(op=op, typ=typ, argsi=args.format(i=i), i=i) \ for i in range(1, get_len(typ))) + '\nreturn ret;' else: utyp = 'u' + typ[1:] return \ '''nsimd_{simd_ext}_vl{typ} ret; ret = ({ppc_typl})vec_splats(({utyp})( nsimd_scalar_{op}_{typ}({args0}) ? -1 : 0)); '''.format(typ=typ, op=op, args0=args.format(i=0), utyp=utyp, ppc_typl=native_typel(typ), simd_ext=simd_ext) + '\n' + \ '\n'.join( 'ret = ({ppc_typl})vec_insert(({utyp})(' \ 'nsimd_scalar_{op}_{typ}({argsi}) ? -1 : 0), ret, {i});'. 
\ format(op=op, typ=typ, utyp=utyp, argsi=args.format(i=i), ppc_typl=native_typel(typ), i=i) \ for i in range(1, get_len(typ))) + '\nreturn ret;' # ----------------------------------------------------------------------------- # Implementation of mandatory functions for this module def emulate_fp16(simd_ext): return True def get_simd_exts(): return ['vmx', 'vsx'] def get_type(opts, simd_ext, typ, nsimd_typ): if simd_ext not in get_simd_exts(): raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) if typ not in common.types: raise ValueError('Unknown type "{}"'.format(typ)) if typ == 'f16': struct = 'struct {__vector float v0; __vector float v1;}' elif simd_ext == 'vmx' and typ in ['i64', 'u64', 'f64']: struct = 'struct {{ {} v0; {} v1; }}'.format(typ, typ) else: struct = native_type(typ) return 'typedef {} {};'.format(struct, nsimd_typ) def get_logical_type(opts, simd_ext, typ, nsimd_typ): if simd_ext not in get_simd_exts(): raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) if typ not in common.types: raise ValueError('Unknown type "{}"'.format(typ)) if typ == 'f16': struct = 'struct {__vector __bool int v0; __vector __bool int v1;}' elif simd_ext == 'vmx' and typ in ['i64', 'u64', 'f64']: struct = 'struct { u64 v0; u64 v1; }' else: struct = native_typel(typ) return 'typedef {} {};'.format(struct, nsimd_typ) def get_nb_registers(simd_ext): if simd_ext == 'vsx': return '64' elif simd_ext == 'vmx': return '32' else: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) def has_compatible_SoA_types(simd_ext): if simd_ext in get_simd_exts(): return False else: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) def get_additional_include(func, platform, simd_ext): ret = '''#include '''.format(func) if simd_ext == 'vsx': ret += '''#include '''.format(func) if func == 'neq': ret += '''#include #include '''.format(simd_ext=simd_ext) elif func in ['loadlu', 'loadla']: ret += '''#include #include #include #include 
'''.format(load='load' + func[5], **fmtspec) elif func in ['storelu']: ret += '''#include #include '''.format(**fmtspec) elif func in ['shr', 'shl']: ret += '''#include '''.format(**fmtspec) elif func == "shra": ret += '''#include ''' elif func in ['zip', 'unzip']: ret += '''#include #include '''.format(unzip_prefix="" if func == "zip" else "un", **fmtspec) elif func in ['unziplo', 'unziphi']: ret += '''#include #include #include '''.format(**fmtspec) elif func[:5] in ['masko', 'maskz']: ret += '''#include ''' elif func == 'mask_for_loop_tail': ret += '''#include #include #include #include '''.format(simd_ext=simd_ext) elif func[:4] == 'load': ret += ''' #include #define NSIMD_PERMUTE_MASK_64(a, b) \ {{ (unsigned char)(8 * a), (unsigned char)(8 * a + 1), \ (unsigned char)(8 * b), (unsigned char)(8 * b + 1) }} #define NSIMD_PERMUTE_MASK_32(a, b, c, d) \ {{ (unsigned char)(4 * a), (unsigned char)(4 * a + 1), \ (unsigned char)(4 * a + 2), (unsigned char)(4 * a + 3), \ (unsigned char)(4 * b), (unsigned char)(4 * b + 1), \ (unsigned char)(4 * b + 2), (unsigned char)(4 * b + 3), \ (unsigned char)(4 * c), (unsigned char)(4 * c + 1), \ (unsigned char)(4 * c + 2), (unsigned char)(4 * c + 3), \ (unsigned char)(4 * d), (unsigned char)(4 * d + 1), \ (unsigned char)(4 * d + 2), (unsigned char)(4 * d + 3) }} #define NSIMD_PERMUTE_MASK_16(a, b, c, d, e, f, g, h) \ {{ (unsigned char)(2 * a + 0), (unsigned char)(2 * a + 1), \ (unsigned char)(2 * b + 0), (unsigned char)(2 * b + 1), \ (unsigned char)(2 * c + 0), (unsigned char)(2 * c + 1), \ (unsigned char)(2 * d + 0), (unsigned char)(2 * d + 1), \ (unsigned char)(2 * e + 0), (unsigned char)(2 * e + 1), \ (unsigned char)(2 * f + 0), (unsigned char)(2 * f + 1), \ (unsigned char)(2 * g + 0), (unsigned char)(2 * g + 1), \ (unsigned char)(2 * h + 0), (unsigned char)(2 * h + 1) }} #define NSIMD_PERMUTE_MASK_8(a, b, c, d, e, f, g, h, \ i, j, k, l, m, n, o, p) \ {{ (unsigned char)(a), (unsigned char)(b), \ (unsigned char)(c), (unsigned 
char)(d), \ (unsigned char)(e), (unsigned char)(f), \ (unsigned char)(g), (unsigned char)(h), \ (unsigned char)(i), (unsigned char)(j), \ (unsigned char)(k), (unsigned char)(l), \ (unsigned char)(m), (unsigned char)(n), \ (unsigned char)(o), (unsigned char)(p) }} '''.format(**fmtspec) return ret # ----------------------------------------------------------------------------- def printf2(*args0): """ debugging purposes decorate the function with it and when executed on test, it will print the environnements *args0 are the name of var to printf """ to_print = [] for arg in args0: if isinstance(arg, str): to_print.append(arg) def decorator(func): import inspect def wrapper(*args, **kwargs): func_args = inspect.signature(func).bind(*args, **kwargs).arguments func_args_str = '{} called on {}\\n'. \ format(func.__name__, fmtspec['typ']) + \ ', "'.join('{} = {!r}'.format(*item) \ for item in func_args.items()) ret = '' if not DEBUG: return func(*args) typ = '' if 'typ' in func_args: typ = func_args['typ'] else: typ = func_args['from_typ'] ret += 'int k;\n' if func.__name__ == 'store1234' and typ in ['f64', 'i64', 'u64']: ret += ''' printf("element to store: %ld %ld", {in1}{suf0}, {in1}{suf1}); printf("\\n"); '''.format(**fmtspec, **get_suf64(typ)) elif func.__name__ == 'store1234' and typ[1:] == '32': ret += ''' printf("element to store:"); for (k = 0; k < 4; k++) {{ printf(" %lx", {in1}[k]); }} printf("\\n"); '''.format(**fmtspec, nbits=get_len(typ)) #print var passed as parameter on printf2 for var in to_print: if ppc_is_vec_type(typ): ret += ''' printf("values of {var}:"); for (k = 0; k < {nbits}; k++) {{ printf(" %lld", {var}[k]); }} printf("\\n"); '''.format(var=var, **fmtspec, nbits=get_len(typ)) return ''' printf("\\n---------------\\n"); printf("{}.{} ( {} )\\n"); '''.format(func.__module__, func.__qualname__, func_args_str) + ret + func(*args) return wrapper return decorator # ----------------------------------------------------------------------------- # Loads of 
degree 1, 2, 3 and 4 # About unaligned loads/stores for Altivec: # https://developer.ibm.com/technologies/systems/articles/pa-dalign/ def load1234(simd_ext, typ, deg, aligned): if typ in ['f64', 'i64', 'u64']: if deg == 1: if simd_ext == 'vmx': return '''nsimd_{simd_ext}_v{typ} ret; ret.v0 = {in0}[0]; ret.v1 = {in0}[1]; return ret;'''.format(**fmtspec) else: return '''nsimd_{simd_ext}_v{typ} ret; ret = vec_splats({in0}[0]); ret = vec_insert({in0}[1], ret, 1); return ret;'''.format(**fmtspec) else: if simd_ext == 'vmx': return \ 'nsimd_{simd_ext}_v{typ}x{} ret;\n'.format(deg, **fmtspec) + \ '\n'.join(['ret.v{i}.v0 = *({in0} + {i});'. \ format(i=i, **fmtspec) \ for i in range(0, deg)]) + \ '\n'.join(['ret.v{i}.v1 = *({in0} + {ipd});'. \ format(i=i, ipd=i + deg, **fmtspec) \ for i in range(0, deg)]) + \ '\nreturn ret;' else: return \ 'nsimd_{simd_ext}_v{typ}x{} ret;\n'.format(deg, **fmtspec) + \ '\n'.join( 'ret.v{i} = vec_splats({in0}[{i}]);'.format(i=i, **fmtspec) \ for i in range(0, deg)) + \ '\n'.join( 'ret.v{i} = vec_insert({in0}[{ipd}], ret.v{i}, 1);'. \ format(i=i, ipd=i + deg, **fmtspec) for i in range(0, deg)) + \ '\nreturn ret;' if typ == 'f16': if deg == 1: return \ '''nsimd_{simd_ext}_vf16 ret; u16 *ptr = (u16 *){in0}; ret.v0 = vec_splats(nsimd_u16_to_f32(ptr[0])); ret.v0 = vec_insert(nsimd_u16_to_f32(ptr[1]), ret.v0, 1); ret.v0 = vec_insert(nsimd_u16_to_f32(ptr[2]), ret.v0, 2); ret.v0 = vec_insert(nsimd_u16_to_f32(ptr[3]), ret.v0, 3); ret.v1 = vec_splats(nsimd_u16_to_f32(ptr[4])); ret.v1 = vec_insert(nsimd_u16_to_f32(ptr[5]), ret.v1, 1); ret.v1 = vec_insert(nsimd_u16_to_f32(ptr[6]), ret.v1, 2); ret.v1 = vec_insert(nsimd_u16_to_f32(ptr[7]), ret.v1, 3); return ret;'''.format(**fmtspec) else: ret = '''nsimd_{simd_ext}_vf16x{deg} ret; u16 *ptr = (u16 *){in0}; '''.format(deg=deg, **fmtspec) for i in range(0, deg): for k in range(0, 2): ret += 'ret.v{}.v{} = vec_splats(' \ 'nsimd_u16_to_f32(ptr[{}]));\n'. 
\ format(i, k, i + k * 4 * deg) for j in range(1, 4): ret += 'ret.v{i}.v{k} = vec_insert(nsimd_u16_to_f32(' \ 'ptr[{o}]), ret.v{i}.v{k}, {j});\n'. \ format(i=i, k=k, j=j, o=i + k * 4 * deg + j * deg) ret += 'return ret;' return ret if deg == 1: if aligned: return 'return vec_ld(0, {in0});'.format(**fmtspec) else: return 'return *({ppc_typ}*){in0};'. \ format(ppc_typ=native_type(typ), **fmtspec) # From here deg >= 2 if aligned: load = 'nsimd_{simd_ext}_v{typ}x{deg} ret;\n'. \ format(deg=deg, **fmtspec) + \ '\n'.join( 'nsimd_{simd_ext}_v{typ} in{i} = vec_ld({o}, {in0});'. \ format(i=i, o=i * 16, **fmtspec) for i in range(deg)) else: load = \ 'nsimd_{simd_ext}_v{typ}x{deg} ret;\n'. \ format(deg=deg, **fmtspec) + \ '\n'.join( 'nsimd_{simd_ext}_v{typ} in{i} = *(({ppc_typ}*){in0} + {i});'. \ format(i=i, ppc_typ=native_type(typ), **fmtspec) \ for i in range(0, deg)) if deg == 2: return '''{load} ret = nsimd_unzip_{simd_ext}_{typ}(in0, in1); return ret;'''.format(load=load, **fmtspec) elif deg == 3: if typ in ['i32', 'u32', 'f32']: return \ '''__vector unsigned char perm1 = NSIMD_PERMUTE_MASK_32( 0, 3, 6, 0); {load} nsimd_{simd_ext}_v{typ} tmp0 = vec_perm(in0, in1, perm1); nsimd_{simd_ext}_v{typ} tmp1 = vec_perm(in1, in2, perm1); nsimd_{simd_ext}_v{typ} tmp2 = vec_perm(in2, in0, perm1); __vector unsigned char perm2 = NSIMD_PERMUTE_MASK_32( 0, 1, 2, 5); __vector unsigned char perm3 = NSIMD_PERMUTE_MASK_32( 5, 0, 1, 2); __vector unsigned char perm4 = NSIMD_PERMUTE_MASK_32( 2, 5, 0, 1); ret.v0 = vec_perm(tmp0, in2, perm2); ret.v1 = vec_perm(tmp1, in0, perm3); ret.v2 = vec_perm(tmp2, in1, perm4); return ret;'''.format(load=load, **fmtspec) elif typ in ['i16', 'u16']: return \ '''{load} __vector unsigned char permRAB = NSIMD_PERMUTE_MASK_16( 0, 3, 6, 9, 12, 15, 0, 0); __vector unsigned char permRDC = NSIMD_PERMUTE_MASK_16( 0, 1, 2, 3, 4, 5, 10, 13); nsimd_{simd_ext}_v{typ} tmp0 = vec_perm(in0, in1, permRAB); ret.v0 = vec_perm(tmp0, in2, permRDC); __vector unsigned char permGAB 
= NSIMD_PERMUTE_MASK_16( 1, 4, 7, 10, 13, 0, 0, 0); __vector unsigned char permGEC = NSIMD_PERMUTE_MASK_16( 0, 1, 2, 3, 4, 8, 11, 14); nsimd_{simd_ext}_v{typ} tmp1 = vec_perm(in0, in1, permGAB); ret.v1 = vec_perm(tmp1, in2, permGEC); __vector unsigned char permBAB = NSIMD_PERMUTE_MASK_16( 2, 5, 8, 11, 14, 0, 0, 0); __vector unsigned char permBFC = NSIMD_PERMUTE_MASK_16( 0, 1, 2, 3, 4, 9, 12, 15); nsimd_{simd_ext}_v{typ} tmp2 = vec_perm(in0, in1, permBAB); ret.v2 = vec_perm(tmp2, in2, permBFC); return ret;'''.format(load=load, **fmtspec) elif typ in ['i8', 'u8']: return \ '''{load} __vector unsigned char permRAB = NSIMD_PERMUTE_MASK_8( 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0, 0, 0); __vector unsigned char permRDC = NSIMD_PERMUTE_MASK_8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29); nsimd_{simd_ext}_v{typ} tmp0 = vec_perm(in0, in1, permRAB); ret.v0 = vec_perm(tmp0, in2, permRDC); __vector unsigned char permGAB = NSIMD_PERMUTE_MASK_8( 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0, 0, 0); __vector unsigned char permGEC = NSIMD_PERMUTE_MASK_8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30); nsimd_{simd_ext}_v{typ} tmp1 = vec_perm(in0, in1, permGAB); ret.v1 = vec_perm(tmp1, in2, permGEC); __vector unsigned char permBAB = NSIMD_PERMUTE_MASK_8( 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 0, 0, 0, 0, 0); __vector unsigned char permBFC = NSIMD_PERMUTE_MASK_8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 19, 22, 25, 28, 31); nsimd_{simd_ext}_v{typ} tmp2 = vec_perm(in0, in1, permBAB); ret.v2 = vec_perm(tmp2, in2, permBFC); return ret;'''.format(load=load, **fmtspec) else: if typ in ['i32', 'u32', 'f32']: return \ '''{load} nsimd_{simd_ext}_v{typ} tmp0 = vec_mergeh(in0, in2); nsimd_{simd_ext}_v{typ} tmp1 = vec_mergel(in0, in2); nsimd_{simd_ext}_v{typ} tmp2 = vec_mergeh(in1, in3); nsimd_{simd_ext}_v{typ} tmp3 = vec_mergel(in1, in3); ret.v0 = vec_mergeh(tmp0, tmp2); ret.v1 = vec_mergel(tmp0, tmp2); ret.v2 = vec_mergeh(tmp1, tmp3); ret.v3 = vec_mergel(tmp1, tmp3); 
# -----------------------------------------------------------------------------
# Stores of degree 1, 2, 3 and 4

def store1234(simd_ext, typ, deg, aligned):
    '''Emit C code storing `deg` interleaved vectors of `typ` to `in0`.

    64-bit types and f16 are emulated (scalar stores); other types build
    interleaved vectors ret0..ret{deg-1} with vec_perm/vec_merge and store
    them with vec_st (aligned) or a plain pointer store (unaligned).'''
    if typ in ['f64', 'i64', 'u64']:
        # vmx: the "vector" is a struct of two scalars (v0/v1);
        # vsx: a native 2-lane vector, read back with vec_extract.
        if simd_ext == 'vmx':
            first = ['{}[{}] = {}.v0;'.
                     format(common.in0, d, common.get_arg(d + 1))
                     for d in range(deg)]
            second = ['{}[{}] = {}.v1;'.
                      format(common.in0, d + deg, common.get_arg(d + 1))
                      for d in range(deg)]
        else:
            first = ['{}[{}] = vec_extract({}, 0);'.
                     format(common.in0, d, common.get_arg(d + 1))
                     for d in range(deg)]
            second = ['{}[{}] = vec_extract({}, 1);'.
                      format(common.in0, d + deg, common.get_arg(d + 1))
                      for d in range(deg)]
        return '\n'.join(first) + '\n' + '\n'.join(second)
    if typ == 'f16':
        # f16 vectors are emulated with two f32 vectors (v0/v1): convert
        # each lane back to u16 bits and store scalar by scalar,
        # interleaving on the fly when deg >= 2.
        if deg == 1:
            return \
            '''u16 *ptr = (u16 *){in0};
               ptr[0] = nsimd_f32_to_u16(vec_extract({in1}.v0, 0));
               ptr[1] = nsimd_f32_to_u16(vec_extract({in1}.v0, 1));
               ptr[2] = nsimd_f32_to_u16(vec_extract({in1}.v0, 2));
               ptr[3] = nsimd_f32_to_u16(vec_extract({in1}.v0, 3));
               ptr[4] = nsimd_f32_to_u16(vec_extract({in1}.v1, 0));
               ptr[5] = nsimd_f32_to_u16(vec_extract({in1}.v1, 1));
               ptr[6] = nsimd_f32_to_u16(vec_extract({in1}.v1, 2));
               ptr[7] = nsimd_f32_to_u16(vec_extract({in1}.v1, 3));'''. \
               format(**fmtspec)
        code = 'u16 *ptr = (u16 *){in0};\n'.format(**fmtspec)
        for d in range(deg):
            for half in range(2):
                for lane in range(4):
                    code += 'ptr[{o}] = nsimd_f32_to_u16(' \
                            'vec_extract({a}.v{k}, {j}));\n'. \
                            format(a=common.get_arg(d + 1), j=lane, k=half,
                                   o=d + half * 4 * deg + lane * deg,
                                   **fmtspec)
        return code
    if deg == 1:
        if aligned:
            return 'vec_st({in1}, 0, {in0});'.format(**fmtspec)
        return '*({ppc_typ} *){in0} = {in1};'. \
               format(ppc_typ=native_type(typ), **fmtspec)
    # From here deg >= 2: `store` writes the interleaved ret0..ret{deg-1}.
    if aligned:
        store = '\n'.join('vec_st(ret{i}, {o}, {in0});'.
                          format(i=d, o=d * 16, **fmtspec)
                          for d in range(deg))
    else:
        store = '\n'.join('*({ppc_typ} *)({in0} + {o}) = ret{i};'.
                          format(o=d * get_len(typ),
                                 ppc_typ=native_type(typ), i=d, **fmtspec)
                          for d in range(deg))
    if deg == 2:
        return \
        '''nsimd_{simd_ext}_v{typ} ret0 = vec_mergeh({in1}, {in2});
           nsimd_{simd_ext}_v{typ} ret1 = vec_mergel({in1}, {in2});
           {store}'''.format(store=store, **fmtspec)
    elif deg == 3:
        if typ in ['i32', 'u32', 'f32']:
            return \
            '''__vector unsigned char perm1 = NSIMD_PERMUTE_MASK_32(
                   0, 2, 4, 6);
               __vector unsigned char perm2 = NSIMD_PERMUTE_MASK_32(
                   0, 2, 5, 7);
               __vector unsigned char perm3 = NSIMD_PERMUTE_MASK_32(
                   1, 3, 5, 7);
               nsimd_{simd_ext}_v{typ} tmp0 = vec_perm({in1}, {in2}, perm1);
               nsimd_{simd_ext}_v{typ} tmp1 = vec_perm({in3}, {in1}, perm2);
               nsimd_{simd_ext}_v{typ} tmp2 = vec_perm({in2}, {in3}, perm3);
               nsimd_{simd_ext}_v{typ} ret0 = vec_perm(tmp0, tmp1, perm1);
               nsimd_{simd_ext}_v{typ} ret1 = vec_perm(tmp2, tmp0, perm2);
               nsimd_{simd_ext}_v{typ} ret2 = vec_perm(tmp1, tmp2, perm3);
               {store}'''.format(store=store, **fmtspec)
        elif typ in ['i16', 'u16']:
            return \
            '''__vector unsigned char permARG = NSIMD_PERMUTE_MASK_16(
                   0, 8, 0, 1, 9, 0, 2, 10);
               __vector unsigned char permAXB = NSIMD_PERMUTE_MASK_16(
                   0, 1, 8, 3, 4, 9, 6, 7);
               nsimd_{simd_ext}_v{typ} tmp0 = vec_perm({in1}, {in2}, permARG);
               nsimd_{simd_ext}_v{typ} ret0 = vec_perm(tmp0, {in3}, permAXB);
               __vector unsigned char permBRG = NSIMD_PERMUTE_MASK_16(
                   0, 3, 11, 0, 4, 12, 0, 5);
               __vector unsigned char permBYB = NSIMD_PERMUTE_MASK_16(
                   10, 1, 2, 11, 4, 5, 12, 7);
               nsimd_{simd_ext}_v{typ} tmp1 = vec_perm({in1}, {in2}, permBRG);
               nsimd_{simd_ext}_v{typ} ret1 = vec_perm(tmp1, {in3}, permBYB);
               __vector unsigned char permCRG = NSIMD_PERMUTE_MASK_16(
                   13, 0, 6, 14, 0, 7, 15, 0);
               __vector unsigned char permCZB = NSIMD_PERMUTE_MASK_16(
                   0, 13, 2, 3, 14, 5, 6, 15);
               nsimd_{simd_ext}_v{typ} tmp2 = vec_perm({in1}, {in2}, permCRG);
               nsimd_{simd_ext}_v{typ} ret2 = vec_perm(tmp2, {in3}, permCZB);
               {store}'''.format(store=store, **fmtspec)
        elif typ in ['i8', 'u8']:
            return \
            '''__vector unsigned char mARG = NSIMD_PERMUTE_MASK_8(
                   0, 16, 0, 1, 17, 0, 2, 18, 0, 3, 19, 0, 4, 20, 0, 5);
               __vector unsigned char mAXB = NSIMD_PERMUTE_MASK_8(
                   0, 1, 16, 3, 4, 17, 6, 7, 18, 9, 10, 19, 12, 13, 20, 15);
               nsimd_{simd_ext}_v{typ} tmp0 = vec_perm({in1}, {in2}, mARG);
               nsimd_{simd_ext}_v{typ} ret0 = vec_perm(tmp0, {in3}, mAXB);
               __vector unsigned char mBRG = NSIMD_PERMUTE_MASK_8(
                   21, 0, 6, 22, 0, 7, 23, 0, 8, 24, 0, 9, 25, 0, 10, 26);
               __vector unsigned char mBYB = NSIMD_PERMUTE_MASK_8(
                   0, 21, 2, 3, 22, 5, 6, 23, 8, 9, 24, 11, 12, 25, 14, 15);
               nsimd_{simd_ext}_v{typ} tmp1 = vec_perm({in1}, {in2}, mBRG);
               nsimd_{simd_ext}_v{typ} ret1 = vec_perm(tmp1, {in3}, mBYB);
               __vector unsigned char mCRG = NSIMD_PERMUTE_MASK_8(
                   0, 11, 27, 0, 12, 28, 0, 13, 29, 0, 14, 30, 0, 15, 31, 0);
               __vector unsigned char mCZB = NSIMD_PERMUTE_MASK_8(
                   26, 1, 2, 27, 4, 5, 28, 7, 8, 29, 10, 11, 30, 13, 14, 31);
               nsimd_{simd_ext}_v{typ} tmp2 = vec_perm({in1}, {in2}, mCRG);
               nsimd_{simd_ext}_v{typ} ret2 = vec_perm(tmp2, {in3}, mCZB);
               {store}'''.format(store=store, **fmtspec)
    else:
        # deg == 4: a two-level mergeh/mergel interleave works whatever the
        # lane width, so one template covers 8-, 16- and 32-bit types (the
        # original had three byte-identical branches).
        return \
        '''nsimd_{simd_ext}_v{typ} tmp0 = vec_mergeh({in1}, {in3});
           nsimd_{simd_ext}_v{typ} tmp1 = vec_mergel({in1}, {in3});
           nsimd_{simd_ext}_v{typ} tmp2 = vec_mergeh({in2}, {in4});
           nsimd_{simd_ext}_v{typ} tmp3 = vec_mergel({in2}, {in4});
           nsimd_{simd_ext}_v{typ} ret0 = vec_mergeh(tmp0, tmp2);
           nsimd_{simd_ext}_v{typ} ret1 = vec_mergel(tmp0, tmp2);
           nsimd_{simd_ext}_v{typ} ret2 = vec_mergeh(tmp1, tmp3);
           nsimd_{simd_ext}_v{typ} ret3 = vec_mergel(tmp1, tmp3);
           {store}'''.format(store=store, **fmtspec)

# -----------------------------------------------------------------------------
# Length

def len1(simd_ext, typ):
    # Registers are 128 bits wide whatever the type.
    return 'return {};'.format(128 // int(typ[1:]))

# -----------------------------------------------------------------------------
# Other helper functions

def simple_op2(op, simd_ext, typ):
    '''Binary operator whose intrinsic is literally vec_<op>.'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v', 'v'])
    return 'return vec_{op}({in0}, {in1});'.format(op=op, **fmtspec)

# Binary operators: and, or, xor, andnot
def binary_op2(op, simd_ext, typ):
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v', 'v'])
    # Map nsimd names to AltiVec names (andnot is "and with complement").
    ppcop = {'orb': 'or', 'xorb': 'xor', 'andb': 'and', 'andnotb': 'andc'}
    return 'return vec_{op}({in0}, {in1});'.format(op=ppcop[op], **fmtspec)

# Logical operators: and, or, xor, andnot
def logical_op2(op, simd_ext, typ):
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['l', 'l', 'l'])
    ppcop = {'orl': 'or', 'xorl': 'xor', 'andl': 'and', 'andnotl': 'andc'}
    return 'return vec_{op}({in0}, {in1});'.format(op=ppcop[op], **fmtspec)

# -----------------------------------------------------------------------------

def div2(simd_ext, typ):
    '''Division: native for floats, lane-by-lane scalar for integers
    (AltiVec has no integer division).'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code('div', simd_ext, typ, ['v', 'v', 'v'])
    elif typ in common.ftypes:
        return 'return vec_div({in0}, {in1});'.format(**fmtspec)
    elif typ in common.iutypes:
        # Lane 0 is filled by vec_splats, so inserts start at lane 1
        # (inserting lane 0 again would be redundant).
        head = '''nsimd_{simd_ext}_v{typ} ret;
                  ret = vec_splats(({typ})(vec_extract({in0}, 0) /
                                           vec_extract({in1}, 0)));
                  '''.format(**fmtspec)
        inserts = '\n'.join(
            '''ret = vec_insert(({typ})(vec_extract({in0}, {i}) /
                                vec_extract({in1}, {i})), ret, {i});'''.
            format(i=lane, **fmtspec) for lane in range(1, get_len(typ)))
        return head + inserts + '\nreturn ret;'
# -----------------------------------------------------------------------------

def not1(simd_ext, typ):
    '''Bitwise NOT, implemented as vec_nor(x, x).'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code('notb', simd_ext, typ, ['v', 'v'])
    return 'return vec_nor({in0}, {in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def lnot1(simd_ext, typ):
    '''Logical NOT on masks, also vec_nor(x, x).'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code('notl', simd_ext, typ, ['l', 'l'])
    return 'return vec_nor({in0}, {in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def sqrt1(simd_ext, typ):
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code('sqrt', simd_ext, typ, ['v', 'v'])
    return 'return vec_sqrt({in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def shift2(op, simd_ext, typ):
    '''Shifts by a scalar count: splat the count then use sl/sr/sra.'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v', 'p'])
    ppcop = {'shl': 'sl', 'shr': 'sr', 'shra': 'sra'}[op]
    return ('return vec_{ppcop}({in0}, '
            'vec_splats((u{typnbits}){in1}));').format(ppcop=ppcop, **fmtspec)

# -----------------------------------------------------------------------------

def set1(simd_ext, typ):
    '''Broadcast a scalar to all lanes.'''
    if typ == 'f16':
        # f16 is emulated as two f32 vectors: convert once, splat twice.
        return '''nsimd_{simd_ext}_vf16 ret;
                  f32 tmp = nsimd_f16_to_f32({in0});
                  ret.v0 = vec_splats(tmp);
                  ret.v1 = ret.v0;
                  return ret;'''.format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        return '''nsimd_{simd_ext}_v{typ} ret;
                  ret.v0 = {in0};
                  ret.v1 = {in0};
                  return ret;'''.format(**fmtspec)
    else:
        return 'return vec_splats({in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def lset1(simd_ext, typ):
    '''Broadcast a boolean to a mask (-1 for true, 0 for false).'''
    if typ == 'f16':
        return \
        '''nsimd_{simd_ext}_vlf16 ret;
           ret.v0 = (__vector __bool int)vec_splats((u32)({in0} ? -1 : 0));
           ret.v1 = ret.v0;
           return ret;'''.format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        return '''nsimd_{simd_ext}_vl{typ} ret;
                  ret.v0 = (u64)({in0} ? -1 : 0);
                  ret.v1 = (u64)({in0} ? -1 : 0);
                  return ret;'''.format(**fmtspec)
    else:
        return '''if ({in0}) {{
                    return ({ppc_typ})vec_splats((u{typnbits})-1);
                  }} else {{
                    return {lzeros};
                  }}'''.format(ppc_typ=native_typel(typ), **fmtspec)

# -----------------------------------------------------------------------------

def cmp2(op, simd_ext, typ):
    '''Comparisons returning masks; integer "ne" is emulated by negating
    vec_cmpeq, floats go through scalar emulation.'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['l', 'v', 'v'])
    elif typ in common.iutypes:
        if op == 'ne':
            # There is no vec_cmpne: compute eq then NOT it.
            return '''nsimd_{simd_ext}_vl{typ} tmp;
                      tmp = vec_cmpeq({in0}, {in1});
                      return vec_nor(tmp, tmp);'''.format(op=op, **fmtspec)
        return 'return vec_cmp{op}({in0}, {in1});'.format(op=op, **fmtspec)
    else:
        return emulate_with_scalar(op, simd_ext, typ, ['l', 'v', 'v'])

# -----------------------------------------------------------------------------

def if_else3(simd_ext, typ):
    '''Lane-wise select: vec_sel picks from in1 where mask in0 is set.'''
    if typ == 'f16':
        return emulate_f16('if_else1', simd_ext, ['v', 'l', 'v', 'v'])
    elif has_to_be_emulated(simd_ext, typ):
        return '''nsimd_{simd_ext}_v{typ} ret;
                  ret.v0 = ({in0}.v0 ? {in1}.v0 : {in2}.v0);
                  ret.v1 = ({in0}.v1 ? {in1}.v1 : {in2}.v1);
                  return ret;'''.format(**fmtspec)
    return 'return vec_sel({in2}, {in1}, {in0});'.format(**fmtspec)
# -----------------------------------------------------------------------------

def minmax2(op, simd_ext, typ):
    '''min/max map directly to vec_min/vec_max.'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v', 'v'])
    return 'return vec_{op}({in0}, {in1});'.format(op=op, **fmtspec)

# -----------------------------------------------------------------------------

def abs1(simd_ext, typ):
    '''abs is the identity on unsigned types, vec_abs otherwise.'''
    if typ in common.utypes:
        return 'return {in0};'.format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        return emulation_code('abs', simd_ext, typ, ['v', 'v'])
    return 'return vec_abs({in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def round1(op, simd_ext, typ):
    '''Rounding operators: identity on integers, scalar emulation for
    round_to_even, native vec_trunc/vec_ceil/vec_floor otherwise.'''
    if typ in common.iutypes:
        return 'return {in0};'.format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v'])
    if op == 'round_to_even':
        return emulate_with_scalar('round_to_even', simd_ext, typ, ['v', 'v'])
    # Remaining ops (trunc, ceil, floor) keep their name in AltiVec.
    return 'return vec_{op}({in0});'.format(op=op, **fmtspec)

# -----------------------------------------------------------------------------

def fma(op, simd_ext, typ):
    '''Fused multiply ops; integers are synthesized from mul/add/sub,
    floats use the native madd/msub/nmadd/nmsub.'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v', 'v', 'v'])
    elif typ in common.iutypes:
        if op == 'fma':
            return \
            'return vec_add(vec_mul({in0}, {in1}), {in2});'.format(**fmtspec)
        elif op == 'fms':
            return \
            'return vec_sub(vec_mul({in0}, {in1}), {in2});'.format(**fmtspec)
        elif op == 'fnma':
            return \
            'return vec_sub({in2}, vec_mul({in0}, {in1}));'.format(**fmtspec)
        elif op == 'fnms':
            return '''return vec_sub(nsimd_neg_{simd_ext}_{typ}({in2}),
                                     vec_mul({in0}, {in1}));'''. \
                   format(**fmtspec)
    elif typ in common.ftypes:
        # Note: fnma = c - a*b = vec_nmsub, fnms = -(a*b + c) = vec_nmadd.
        ppcop = {'fma': 'vec_madd', 'fms': 'vec_msub',
                 'fnms': 'vec_nmadd', 'fnma': 'vec_nmsub'}
        return 'return {ppcop}({in0}, {in1}, {in2});'. \
               format(ppcop=ppcop[op], **fmtspec)

# -----------------------------------------------------------------------------

def neg1(simd_ext, typ):
    '''Negation: vec_neg where available, 0 - x for unsigned types.'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code('neg', simd_ext, typ, ['v', 'v'])
    elif typ in common.itypes or typ in common.ftypes:
        return 'return vec_neg({in0});'.format(**fmtspec)
    else:
        return 'return vec_sub({zeros}, {in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def recs1(op, simd_ext, typ):
    '''Reciprocal family: exact rec via division, rec8/rec11 via the
    vec_re estimate, rsqrt8/rsqrt11 via vec_rsqrte.'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v'])
    elif op == 'rec':
        return 'return vec_div(vec_splats(({typ})1), {in0});'. \
               format(**fmtspec)
    elif op in ['rec8', 'rec11']:
        return 'return vec_re({in0});'.format(**fmtspec)
    elif op in ['rsqrt8', 'rsqrt11']:
        return 'return vec_rsqrte({in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def loadl(aligned, simd_ext, typ):
    '''Load of logicals: load values then compare against zero.'''
    return \
    '''/* This can surely be improved but it is not our priority. */
       return nsimd_notl_{simd_ext}_{typ}(nsimd_eq_{simd_ext}_{typ}(
                  nsimd_load{align}_{simd_ext}_{typ}(
                      {in0}), nsimd_set1_{simd_ext}_{typ}({zero})));'''. \
    format(align='a' if aligned else 'u',
           zero='nsimd_f32_to_f16(0.0f)' if typ == 'f16'
                else '({})0'.format(typ), **fmtspec)

# -----------------------------------------------------------------------------

def storel(aligned, simd_ext, typ):
    '''Store of logicals: materialize 1/0 values then store them.'''
    return \
    '''/* This can surely be improved but it is not our priority. */
       nsimd_store{align}_{simd_ext}_{typ}({in0},
           nsimd_if_else1_{simd_ext}_{typ}({in1},
               nsimd_set1_{simd_ext}_{typ}({one}),
               nsimd_set1_{simd_ext}_{typ}({zero})));'''. \
    format(align='a' if aligned else 'u',
           one='nsimd_f32_to_f16(1.0f)' if typ == 'f16'
               else '({})1'.format(typ),
           zero='nsimd_f32_to_f16(0.0f)' if typ == 'f16'
                else '({})0'.format(typ), **fmtspec)
# -----------------------------------------------------------------------------

def allany1(op, simd_ext, typ):
    '''all/any reductions on masks via vec_all_ne/vec_any_ne.'''
    combine = '&&' if op == 'all' else '||'
    if typ == 'f16':
        return \
        '''return nsimd_{op}_{simd_ext}_f32({in0}.v0) {binop}
                  nsimd_{op}_{simd_ext}_f32({in0}.v1);'''. \
        format(op=op, binop=combine, **fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        return 'return {in0}.v0 {binop} {in0}.v1;'. \
               format(binop=combine, **fmtspec)
    return 'return vec_{op}_ne({in0}, ({lzeros}));'.format(op=op, **fmtspec)

# -----------------------------------------------------------------------------

def nbtrue1(simd_ext, typ):
    '''Count lanes whose mask is set.'''
    if typ == 'f16':
        return \
        '''return nbsimd_placeholder'''  # never emitted; see below
    return None
# -----------------------------------------------------------------------------

def convert1(simd_ext, from_typ, to_typ):
    '''Same-size value conversion between from_typ and to_typ.'''
    if from_typ == to_typ:
        return 'return {in0};'.format(**fmtspec)
    elif from_typ == 'f16' and to_typ == 'u16':
        return \
        '''return vec_pack((__vector unsigned int)vec_ctu({in0}.v0, 0),
                           (__vector unsigned int)vec_ctu({in0}.v1, 0));'''. \
        format(**fmtspec)
    elif from_typ == 'f16' and to_typ == 'i16':
        return \
        '''return vec_pack((__vector signed int)vec_cts({in0}.v0, 0),
                           (__vector signed int)vec_cts({in0}.v1, 0));'''. \
        format(**fmtspec)
    elif from_typ == 'u16' and to_typ == 'f16':
        return \
        '''nsimd_{simd_ext}_vf16 ret;
           /* Unpack extends the sign, we need to remove the extra 1s */
           __vector int mask = vec_splats((int)0xFFFF);
           ret.v0 = vec_ctf(vec_and(vec_unpackh((__vector short){in0}),
                                    mask), 0);
           ret.v1 = vec_ctf(vec_and(vec_unpackl((__vector short){in0}),
                                    mask), 0);
           return ret;'''.format(**fmtspec)
    elif from_typ == 'i16' and to_typ == 'f16':
        return '''nsimd_{simd_ext}_vf16 ret;
                  ret.v0 = vec_ctf(vec_unpackh({in0}), 0);
                  ret.v1 = vec_ctf(vec_unpackl({in0}), 0);
                  return ret;'''.format(**fmtspec)
    elif has_to_be_emulated(simd_ext, to_typ):
        return '''nsimd_{simd_ext}_v{to_typ} ret;
                  ret.v0 = nsimd_scalar_cvt_{to_typ}_{from_typ}({in0}.v0);
                  ret.v1 = nsimd_scalar_cvt_{to_typ}_{from_typ}({in0}.v1);
                  return ret;'''.format(**fmtspec)
    elif from_typ in ['f32', 'f64'] and to_typ in ['i32', 'i64']:
        return 'return vec_cts({in0}, 0);'.format(**fmtspec)
    elif from_typ in ['f32', 'f64'] and to_typ in ['u32', 'u64']:
        return 'return vec_ctu({in0}, 0);'.format(**fmtspec)
    elif from_typ in ['i32', 'i64', 'u32', 'u64'] and \
         to_typ in ['f32', 'f64']:
        return 'return vec_ctf({in0}, 0);'.format(**fmtspec)
    elif from_typ in common.iutypes and to_typ in common.iutypes:
        # Same-size integer conversions are a plain cast.
        return 'return ({ppctyp}){in0};'. \
               format(ppctyp=native_type(to_typ), **fmtspec)

# -----------------------------------------------------------------------------

def reinterpret1(simd_ext, from_typ, to_typ):
    '''Same-size bit reinterpretation; f16 cases go lane by lane through
    the scalar helpers.'''
    if from_typ == to_typ:
        return 'return {in0};'.format(**fmtspec)
    elif simd_ext == 'vmx' and from_typ in ['f64', 'i64', 'u64']:
        return \
        '''nsimd_{simd_ext}_v{to_typ} ret;
           ret.v0 = nsimd_scalar_reinterpret_{to_typ}_{from_typ}({in0}.v0);
           ret.v1 = nsimd_scalar_reinterpret_{to_typ}_{from_typ}({in0}.v1);
           return ret;'''.format(**fmtspec)
    elif from_typ == 'f16' and to_typ == 'u16':
        return \
        '''nsimd_{simd_ext}_vu16 ret;
           ret = vec_splats(nsimd_f32_to_u16(vec_extract({in0}.v0, 0)));
           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v0, 1)),
                            ret, 1);
           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v0, 2)),
                            ret, 2);
           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v0, 3)),
                            ret, 3);
           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v1, 0)),
                            ret, 4);
           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v1, 1)),
                            ret, 5);
           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v1, 2)),
                            ret, 6);
           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v1, 3)),
                            ret, 7);
           return ret;'''.format(**fmtspec)
    elif from_typ == 'f16' and to_typ == 'i16':
        return \
        '''nsimd_{simd_ext}_vi16 ret;
           ret = vec_splats(nsimd_scalar_reinterpret_i16_u16(
                     nsimd_f32_to_u16(vec_extract({in0}.v0, 0))));
           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(
                     nsimd_f32_to_u16(vec_extract({in0}.v0, 1))), ret, 1);
           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(
                     nsimd_f32_to_u16(vec_extract({in0}.v0, 2))), ret, 2);
           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(
                     nsimd_f32_to_u16(vec_extract({in0}.v0, 3))), ret, 3);
           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(
                     nsimd_f32_to_u16(vec_extract({in0}.v1, 0))), ret, 4);
           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(
                     nsimd_f32_to_u16(vec_extract({in0}.v1, 1))), ret, 5);
           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(
                     nsimd_f32_to_u16(vec_extract({in0}.v1, 2))), ret, 6);
           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(
                     nsimd_f32_to_u16(vec_extract({in0}.v1, 3))), ret, 7);
           return ret;'''.format(**fmtspec)
    elif from_typ == 'u16' and to_typ == 'f16':
        return \
        '''nsimd_{simd_ext}_vf16 ret;
           ret.v0 = vec_splats(nsimd_u16_to_f32(vec_extract({in0}, 0)));
           ret.v0 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 1)),
                               ret.v0, 1);
           ret.v0 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 2)),
                               ret.v0, 2);
           ret.v0 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 3)),
                               ret.v0, 3);
           ret.v1 = vec_splats(nsimd_u16_to_f32(vec_extract({in0}, 4)));
           ret.v1 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 5)),
                               ret.v1, 1);
           ret.v1 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 6)),
                               ret.v1, 2);
           ret.v1 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 7)),
                               ret.v1, 3);
           return ret;'''.format(**fmtspec)
    elif from_typ == 'i16' and to_typ == 'f16':
        return \
        '''nsimd_{simd_ext}_vf16 ret;
           ret.v0 = vec_splats(nsimd_u16_to_f32(
                        nsimd_scalar_reinterpret_u16_i16(
                            vec_extract({in0}, 0))));
           ret.v0 = vec_insert(nsimd_u16_to_f32(
                        nsimd_scalar_reinterpret_u16_i16(
                            vec_extract({in0}, 1))), ret.v0, 1);
           ret.v0 = vec_insert(nsimd_u16_to_f32(
                        nsimd_scalar_reinterpret_u16_i16(
                            vec_extract({in0}, 2))), ret.v0, 2);
           ret.v0 = vec_insert(nsimd_u16_to_f32(
                        nsimd_scalar_reinterpret_u16_i16(
                            vec_extract({in0}, 3))), ret.v0, 3);
           ret.v1 = vec_splats(nsimd_u16_to_f32(
                        nsimd_scalar_reinterpret_u16_i16(
                            vec_extract({in0}, 4))));
           ret.v1 = vec_insert(nsimd_u16_to_f32(
                        nsimd_scalar_reinterpret_u16_i16(
                            vec_extract({in0}, 5))), ret.v1, 1);
           ret.v1 = vec_insert(nsimd_u16_to_f32(
                        nsimd_scalar_reinterpret_u16_i16(
                            vec_extract({in0}, 6))), ret.v1, 2);
           ret.v1 = vec_insert(nsimd_u16_to_f32(
                        nsimd_scalar_reinterpret_u16_i16(
                            vec_extract({in0}, 7))), ret.v1, 3);
           return ret;'''.format(**fmtspec)
    else:
        # All other same-size reinterprets are a plain vector cast.
        return 'return ({ppc_typ}){in0};'. \
               format(ppc_typ=native_type(to_typ), **fmtspec)
# -----------------------------------------------------------------------------

def reverse1(simd_ext, typ):
    '''Reverse lane order with a byte permutation per element width.'''
    if typ == 'f16':
        return emulate_f16('reverse', simd_ext, ['v', 'v'])
    elif has_to_be_emulated(simd_ext, typ):
        # Emulated 2-lane type: just swap the two halves.
        return '''nsimd_{simd_ext}_v{typ} ret;
                  ret.v0 = {in0}.v1;
                  ret.v1 = {in0}.v0;
                  return ret;'''.format(**fmtspec)
    elif typ in ['i8', 'u8']:
        return '''return vec_perm({in0}, {in0}, (__vector unsigned char)
                      {{ 15, 14, 13, 12, 11, 10, 9, 8,
                         7, 6, 5, 4, 3, 2, 1, 0 }});'''.format(**fmtspec)
    elif typ in ['i16', 'u16']:
        return '''return vec_perm({in0}, {in0}, (__vector unsigned char)
                      {{ 14, 15, 12, 13, 10, 11, 8, 9,
                         6, 7, 4, 5, 2, 3, 0, 1 }});'''.format(**fmtspec)
    elif typ in ['i32', 'u32', 'f32']:
        return '''return vec_perm({in0}, {in0}, (__vector unsigned char)
                      {{ 12, 13, 14, 15, 8, 9, 10, 11,
                         4, 5, 6, 7, 0, 1, 2, 3 }});'''.format(**fmtspec)
    elif typ in ['f64', 'i64', 'u64']:
        return '''return vec_perm({in0}, {in0}, (__vector unsigned char)
                      {{ 8, 9, 10, 11, 12, 13, 14, 15,
                         0, 1, 2, 3, 4, 5, 6, 7 }});'''.format(**fmtspec)

# -----------------------------------------------------------------------------

def addv(simd_ext, typ):
    '''Horizontal add of all lanes, returned as a scalar.'''
    if typ == 'f16':
        return '''return nsimd_f32_to_f16(
                      nsimd_addv_{simd_ext}_f32({in0}.v0) +
                      nsimd_addv_{simd_ext}_f32({in0}.v1));'''. \
               format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        return 'return {in0}.v0 + {in0}.v1;'.format(**fmtspec)
    terms = ' + '.join('vec_extract({in0}, {i})'.format(i=lane, **fmtspec)
                       for lane in range(get_len(typ)))
    return 'return ({})({});'.format(typ, terms)

# -----------------------------------------------------------------------------

def add_sub_s(op, simd_ext, typ):
    '''Saturated add/sub: floats never saturate (plain vec_add/vec_sub),
    64-bit integers go through the scalar helpers, the rest uses the
    native saturated intrinsics.'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v', 'v'])
    if typ in common.ftypes:
        # op is 'adds'/'subs': strip the trailing 's' for floats.
        return 'return vec_{op}({in0}, {in1});'.format(op=op[:-1], **fmtspec)
    elif typ in ['i64', 'u64']:
        return '''nsimd_{simd_ext}_v{typ} ret;
                  ret = vec_splats(nsimd_scalar_{op}_{typ}(
                            vec_extract({in0}, 0), vec_extract({in1}, 0)));
                  ret = vec_insert(nsimd_scalar_{op}_{typ}(
                            vec_extract({in0}, 1), vec_extract({in1}, 1)),
                            ret, 1);
                  return ret;'''.format(op=op, **fmtspec)
    return 'return vec_{op}({in0}, {in1});'.format(op=op, **fmtspec)

# -----------------------------------------------------------------------------

def upcvt1(simd_ext, from_typ, to_typ):
    '''Widening conversion: one input vector becomes two output vectors.'''
    if from_typ in ['i8', 'u8'] and to_typ == 'f16':
        # Fix: use the {in0} placeholder like every other branch instead of
        # hard-coding the argument name a0.
        return '''nsimd_{simd_ext}_vf16x2 ret;
                  nsimd_{simd_ext}_vi16x2 tmp;
                  tmp = nsimd_upcvt_{simd_ext}_i16_{from_typ}({in0});
                  ret.v0 = nsimd_cvt_{simd_ext}_f16_i16(tmp.v0);
                  ret.v1 = nsimd_cvt_{simd_ext}_f16_i16(tmp.v1);
                  return ret;'''.format(**fmtspec)
    elif from_typ == 'f16' and to_typ == 'f32':
        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;
                  ret.v0 = {in0}.v0;
                  ret.v1 = {in0}.v1;
                  return ret;'''.format(**fmtspec)
    elif from_typ == 'f16' and to_typ in ['i32', 'u32']:
        sign = 'u' if to_typ[0] == 'u' else 's'
        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;
                  ret.v0 = vec_ct{sign}({in0}.v0, 0);
                  ret.v1 = vec_ct{sign}({in0}.v1, 0);
                  return ret;'''.format(sign=sign, **fmtspec)
    elif from_typ == 'f32' and to_typ in ['f64', 'i64', 'u64']:
        if simd_ext == 'vmx':
            return '''nsimd_vmx_v{to_typ}x2 ret;
                      ret.v0.v0 = ({to_typ})vec_extract({in0}, 0);
                      ret.v0.v1 = ({to_typ})vec_extract({in0}, 1);
                      ret.v1.v0 = ({to_typ})vec_extract({in0}, 2);
                      ret.v1.v1 = ({to_typ})vec_extract({in0}, 3);
                      return ret;'''.format(**fmtspec)
        else:
            return \
            '''nsimd_vsx_v{to_typ}x2 ret;
               ret.v0 = vec_splats(({to_typ})vec_extract({in0}, 0));
               ret.v0 = vec_insert(({to_typ})vec_extract({in0}, 1),
                                   ret.v0, 1);
               ret.v1 = vec_splats(({to_typ})vec_extract({in0}, 2));
               ret.v1 = vec_insert(({to_typ})vec_extract({in0}, 3),
                                   ret.v1, 1);
               return ret;'''.format(**fmtspec)
    elif (from_typ in ['i16', 'u16'] and to_typ == 'f32') or \
         (from_typ in ['i32', 'u32'] and to_typ == 'f64'):
        # Widen to the same-signedness integer first, then convert.
        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;
                  nsimd_{simd_ext}_v{sto_typ}x2 tmp;
                  tmp = nsimd_upcvt_{simd_ext}_{sto_typ}_{from_typ}({in0});
                  ret.v0 = nsimd_cvt_{simd_ext}_{to_typ}_{sto_typ}(tmp.v0);
                  ret.v1 = nsimd_cvt_{simd_ext}_{to_typ}_{sto_typ}(tmp.v1);
                  return ret;'''. \
               format(sto_typ=from_typ[0] + to_typ[1:], **fmtspec)
    elif from_typ in ['u8', 'u16']:
        # vec_unpack sign-extends: mask off the extra 1 bits afterwards.
        mask = '(i{})0x{}'.format(to_typ[1:], 'F' * (int(from_typ[1:]) // 4))
        ppc_sto_typ = native_type('i' + to_typ[1:])
        ppc_sfrom_typ = '({})'.format(native_type('i' + from_typ[1:]))
        ppc_to_typ = '({})'.format(native_type(to_typ)) \
                     if to_typ in common.utypes else ''
        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;
                  {ppc_sto_typ} mask = vec_splats({mask});
                  ret.v0 = {ppc_to_typ}vec_and(
                               vec_unpackh({ppc_sfrom_typ}{in0}), mask);
                  ret.v1 = {ppc_to_typ}vec_and(
                               vec_unpackl({ppc_sfrom_typ}{in0}), mask);
                  return ret;'''. \
               format(mask=mask, ppc_sto_typ=ppc_sto_typ,
                      ppc_sfrom_typ=ppc_sfrom_typ, ppc_to_typ=ppc_to_typ,
                      **fmtspec)
    elif from_typ in ['i8', 'i16']:
        ppc_to_typ = '({})'.format(native_type(to_typ)) \
                     if to_typ in common.utypes else ''
        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;
                  ret.v0 = {ppc_to_typ}vec_unpackh({in0});
                  ret.v1 = {ppc_to_typ}vec_unpackl({in0});
                  return ret;'''.format(ppc_to_typ=ppc_to_typ, **fmtspec)
    elif from_typ in ['i32', 'u32']:
        if simd_ext == 'vmx':
            return '''nsimd_vmx_v{to_typ}x2 ret;
                      ret.v0.v0 = ({to_typ})vec_extract({in0}, 0);
                      ret.v0.v1 = ({to_typ})vec_extract({in0}, 1);
                      ret.v1.v0 = ({to_typ})vec_extract({in0}, 2);
                      ret.v1.v1 = ({to_typ})vec_extract({in0}, 3);
                      return ret;'''.format(**fmtspec)
        else:
            return \
            '''nsimd_vsx_v{to_typ}x2 ret;
               ret.v0 = vec_splats(({to_typ})vec_extract({in0}, 0));
               ret.v0 = vec_insert(({to_typ})vec_extract({in0}, 1),
                                   ret.v0, 1);
               ret.v1 = vec_splats(({to_typ})vec_extract({in0}, 2));
               ret.v1 = vec_insert(({to_typ})vec_extract({in0}, 3),
                                   ret.v1, 1);
               return ret;'''.format(**fmtspec)
# -----------------------------------------------------------------------------

def downcvt1(simd_ext, from_typ, to_typ):
    '''Return the C body implementing the nsimd "downcvt" operator:
    narrow two input vectors of from_typ into one vector of to_typ.
    Relies on the module-global fmtspec set up by get_impl.'''
    if from_typ in ['f64', 'i64', 'u64']:
        # 64-bit lanes: on vmx the 64-bit "vector" is an emulated struct of
        # two scalars; on vsx it is a native 2-lane vector, so lanes are
        # moved one by one with vec_extract/vec_insert.
        if simd_ext == 'vmx':
            return '''nsimd_vmx_v{to_typ} ret;
                      ret = vec_splats(({to_typ}){in0}.v0);
                      ret = vec_insert(({to_typ}){in0}.v1, ret, 1);
                      ret = vec_insert(({to_typ}){in1}.v0, ret, 2);
                      ret = vec_insert(({to_typ}){in1}.v1, ret, 3);
                      return ret;'''.format(**fmtspec)
        else:
            return \
            '''nsimd_vsx_v{to_typ} ret;
               ret = vec_splats(({to_typ})vec_extract({in0}, 0));
               ret = vec_insert(({to_typ})vec_extract({in0}, 1), ret, 1);
               ret = vec_insert(({to_typ})vec_extract({in1}, 0), ret, 2);
               ret = vec_insert(({to_typ})vec_extract({in1}, 1), ret, 3);
               return ret;'''.format(**fmtspec)
    elif from_typ in common.iutypes and to_typ in common.iutypes:
        # Integer -> integer narrowing maps directly onto vec_pack; a cast
        # is only needed when signedness changes.
        return 'return {cast}vec_pack({in0}, {in1});'. \
               format(cast='({})'.format(native_type(to_typ)) \
                           if from_typ[0] != to_typ[0] else '', **fmtspec)
    elif from_typ == 'f32' and to_typ == 'f16':
        # f16 vectors are emulated as a pair of f32 vectors: just repack.
        return '''nsimd_{simd_ext}_vf16 ret;
                  ret.v0 = {in0};
                  ret.v1 = {in1};
                  return ret;'''.format(**fmtspec)
    elif from_typ == 'f32' and to_typ in common.iutypes:
        # Convert float to (un)signed int (vec_cts / vec_ctu), then pack.
        return 'return vec_pack(vec_ct{s}({in0}, 0), vec_ct{s}({in1}, 0));'. \
               format(s='s' if to_typ == 'i16' else 'u', **fmtspec)
    elif from_typ in common.iutypes and to_typ == 'f16':
        # Integer halves converted independently to the two f32 vectors of
        # the emulated f16 pair.
        return '''nsimd_{simd_ext}_vf16 ret;
                  ret.v0 = vec_ctf({in0}, 0);
                  ret.v1 = vec_ctf({in1}, 0);
                  return ret;'''.format(**fmtspec)
    elif from_typ == 'f16':
        # f16 (pair of f32 vectors) down to 8-bit integers: two levels of
        # vec_pack after the float-to-int conversions.
        return \
        '''return vec_pack(vec_pack(vec_ct{s}({in0}.v0, 0),
                                    vec_ct{s}({in0}.v1, 0)),
                           vec_pack(vec_ct{s}({in1}.v0, 0),
                                    vec_ct{s}({in1}.v1, 0)));'''. \
        format(s='s' if to_typ == 'i8' else 'u', **fmtspec)
# -----------------------------------------------------------------------------

def to_mask(simd_ext, typ):
    '''Return the C body converting a logical (mask) vector into a regular
    vector of typ by reinterpreting its bits.'''
    if typ == 'f16':
        # Emulated f16: reinterpret each of the two underlying f32 vectors.
        return '''nsimd_{simd_ext}_vf16 ret;
                  ret.v0 = (__vector float){in0}.v0;
                  ret.v1 = (__vector float){in0}.v1;
                  return ret;'''.format(**fmtspec)
    if simd_ext == 'vmx' and typ in ['f64', 'i64']:
        # vmx 64-bit types are emulated as scalar pairs; the logical holds
        # u64 values, so reinterpret each scalar to the destination type.
        return '''nsimd_{simd_ext}_v{typ} ret;
                  ret.v0 = nsimd_scalar_reinterpret_{typ}_u64({in0}.v0);
                  ret.v1 = nsimd_scalar_reinterpret_{typ}_u64({in0}.v1);
                  return ret;'''.format(**fmtspec)
    elif simd_ext == 'vmx' and typ == 'u64':
        # Already u64: a plain member-wise copy suffices.
        return '''nsimd_{simd_ext}_vu64 ret;
                  ret.v0 = {in0}.v0;
                  ret.v1 = {in0}.v1;
                  return ret;'''.format(**fmtspec)
    # Native vectors: a C-style cast to the native vector type reinterprets
    # the mask bits in place.
    return 'return ({ppc_typ}){in0};'. \
           format(ppc_typ=native_type(typ), **fmtspec)
# -----------------------------------------------------------------------------

def gather_linear(simd_ext, typ):
    '''Return the C body for gather_linear: load lanes from {in0} at a
    constant stride {in1} (lane k reads {in0}[k * {in1}]).'''
    if typ == 'f16':
        # Emulated f16: convert each strided f16 element to f32 and insert
        # it into the proper lane of the two f32 halves.
        return \
        '''nsimd_{simd_ext}_v{typ} ret;
           ret.v0 = vec_splats(nsimd_f16_to_f32({in0}[0]));
           ret.v0 = vec_insert(nsimd_f16_to_f32({in0}[{in1}]), ret.v0, 1);
           ret.v0 = vec_insert(nsimd_f16_to_f32({in0}[2 * {in1}]), ret.v0, 2);
           ret.v0 = vec_insert(nsimd_f16_to_f32({in0}[3 * {in1}]), ret.v0, 3);
           ret.v1 = vec_splats(nsimd_f16_to_f32({in0}[4 * {in1}]));
           ret.v1 = vec_insert(nsimd_f16_to_f32({in0}[5 * {in1}]), ret.v1, 1);
           ret.v1 = vec_insert(nsimd_f16_to_f32({in0}[6 * {in1}]), ret.v1, 2);
           ret.v1 = vec_insert(nsimd_f16_to_f32({in0}[7 * {in1}]), ret.v1, 3);
           return ret;'''.format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        # Emulated 64-bit pair: two strided scalar loads.
        return '''nsimd_{simd_ext}_v{typ} ret;
                  ret.v0 = {in0}[0];
                  ret.v1 = {in0}[{in1}];
                  return ret;'''.format(**fmtspec)
    # Native vector: splat lane 0 then insert the remaining strided loads,
    # one statement per lane.
    return '''nsimd_{simd_ext}_v{typ} ret;
              ret = vec_splats({in0}[0]);
              '''.format(**fmtspec) + \
           '\n'.join('ret = vec_insert({in0}[{in1} * {i}], ret, {i});'. \
                     format(i=i, **fmtspec) for i in range(1, get_len(typ))) + \
           '\nreturn ret;'
# -----------------------------------------------------------------------------

def maskoz_load(oz, simd_ext, typ):
    '''Return the C body for masked loads.  oz == 'z' zeroes out masked-off
    lanes (maskz_load*); any other value takes them from the fallback
    vector {in2} (masko_load*).'''
    if typ == 'f16':
        # NOTE: two chained .format calls are intentional.  The first pass
        # substitutes the ozN fallback snippets (which may themselves
        # contain {in2}) plus fmtspec; the second pass resolves the {in2}
        # left inside those snippets.
        return \
        '''nsimd_{simd_ext}_vf16 ret;
           ret.v0 = vec_splats(0.0f);
           ret.v0 = vec_insert(vec_extract({in0}.v0, 0) ?
                        nsimd_f16_to_f32({in1}[0]) : {oz0}, ret.v0, 0);
           ret.v0 = vec_insert(vec_extract({in0}.v0, 1) ?
                        nsimd_f16_to_f32({in1}[1]) : {oz1}, ret.v0, 1);
           ret.v0 = vec_insert(vec_extract({in0}.v0, 2) ?
                        nsimd_f16_to_f32({in1}[2]) : {oz2}, ret.v0, 2);
           ret.v0 = vec_insert(vec_extract({in0}.v0, 3) ?
                        nsimd_f16_to_f32({in1}[3]) : {oz3}, ret.v0, 3);
           ret.v1 = ret.v0;
           ret.v1 = vec_insert(vec_extract({in0}.v1, 0) ?
                        nsimd_f16_to_f32({in1}[4]) : {oz4}, ret.v1, 0);
           ret.v1 = vec_insert(vec_extract({in0}.v1, 1) ?
                        nsimd_f16_to_f32({in1}[5]) : {oz5}, ret.v1, 1);
           ret.v1 = vec_insert(vec_extract({in0}.v1, 2) ?
                        nsimd_f16_to_f32({in1}[6]) : {oz6}, ret.v1, 2);
           ret.v1 = vec_insert(vec_extract({in0}.v1, 3) ?
                        nsimd_f16_to_f32({in1}[7]) : {oz7}, ret.v1, 3);
           return ret;'''. \
        format(oz0='0.0f' if oz == 'z' else 'vec_extract({in2}.v0, 0)',
               oz1='0.0f' if oz == 'z' else 'vec_extract({in2}.v0, 1)',
               oz2='0.0f' if oz == 'z' else 'vec_extract({in2}.v0, 2)',
               oz3='0.0f' if oz == 'z' else 'vec_extract({in2}.v0, 3)',
               oz4='0.0f' if oz == 'z' else 'vec_extract({in2}.v1, 0)',
               oz5='0.0f' if oz == 'z' else 'vec_extract({in2}.v1, 1)',
               oz6='0.0f' if oz == 'z' else 'vec_extract({in2}.v1, 2)',
               oz7='0.0f' if oz == 'z' else 'vec_extract({in2}.v1, 3)',
               **fmtspec).format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        # Emulated 64-bit pair: plain C conditionals per scalar member.
        if oz == 'z':
            return '''nsimd_{simd_ext}_v{typ} ret;
                      ret.v0 = {in0}.v0 ? {in1}[0] : ({typ})0;
                      ret.v1 = {in0}.v1 ? {in1}[1] : ({typ})0;
                      return ret;'''.format(**fmtspec)
        else:
            return '''nsimd_{simd_ext}_v{typ} ret;
                      ret.v0 = {in0}.v0 ? {in1}[0] : {in2}.v0;
                      ret.v1 = {in0}.v1 ? {in1}[1] : {in2}.v1;
                      return ret;'''.format(**fmtspec)
    # Native vectors: start from zeros, then per lane either load from
    # memory or insert the fallback ('z' -> literal zero, 'o' -> {in2} lane).
    return 'nsimd_{simd_ext}_v{typ} ret = {zeros};\n'.format(**fmtspec) + \
           '\n'.join(
           '''if (vec_extract({in0}, {i})) {{
                ret = vec_insert({in1}[{i}], ret, {i});
              }} else {{
                ret = vec_insert({v}, ret, {i});
              }}'''.format(i=i, v='({})0'.format(typ) if oz == 'z' \
                                else 'vec_extract({in2}, {i})'. \
                                     format(i=i, **fmtspec), **fmtspec) \
           for i in range(get_len(typ))) + \
           '\nreturn ret;'
# -----------------------------------------------------------------------------

def get_impl(opts, func, simd_ext, from_typ, to_typ):
    '''Entry point of this platform module: return the C implementation of
    operator `func` for SIMD extension `simd_ext` on `from_typ` (and, for
    conversions, `to_typ`).

    Side effect: (re)initializes the module-global `fmtspec` dict consumed
    by every generator in this file.

    Raises ValueError on an unknown SIMD extension or type; returns
    common.NOT_IMPLEMENTED when `func` has no generator here.
    '''
    global fmtspec
    fmtspec = {
        'simd_ext': simd_ext,
        'typ': from_typ,
        'styp': get_type(opts, simd_ext, from_typ, to_typ),
        'from_typ': from_typ,
        'to_typ': to_typ,
        'in0': common.in0,
        'in1': common.in1,
        'in2': common.in2,
        'in3': common.in3,
        'in4': common.in4,
        'in5': common.in5,
        'zeros': 'vec_splats(({})0)'.format(from_typ),
        # Logical all-zeros constant; only meaningful for types backed by a
        # native vector, hence the emulation guard.
        'lzeros': '({})vec_splats((u{})0)'. \
                  format(native_typel(from_typ), from_typ[1:]) \
                  if not has_to_be_emulated(simd_ext, from_typ) else '',
        'typnbits': from_typ[1:]
    }
    # Dispatch table: operator name -> zero-argument thunk producing the C
    # body.  Thunks keep evaluation lazy (only the requested generator
    # runs) while avoiding the previous eval()-of-strings pattern.
    impls = {
        'loada': lambda: load1234(simd_ext, from_typ, 1, True),
        'load2a': lambda: load1234(simd_ext, from_typ, 2, True),
        'load3a': lambda: load1234(simd_ext, from_typ, 3, True),
        'load4a': lambda: load1234(simd_ext, from_typ, 4, True),
        'loadu': lambda: load1234(simd_ext, from_typ, 1, False),
        'load2u': lambda: load1234(simd_ext, from_typ, 2, False),
        'load3u': lambda: load1234(simd_ext, from_typ, 3, False),
        'load4u': lambda: load1234(simd_ext, from_typ, 4, False),
        'storea': lambda: store1234(simd_ext, from_typ, 1, True),
        'store2a': lambda: store1234(simd_ext, from_typ, 2, True),
        'store3a': lambda: store1234(simd_ext, from_typ, 3, True),
        'store4a': lambda: store1234(simd_ext, from_typ, 4, True),
        'storeu': lambda: store1234(simd_ext, from_typ, 1, False),
        'store2u': lambda: store1234(simd_ext, from_typ, 2, False),
        'store3u': lambda: store1234(simd_ext, from_typ, 3, False),
        'store4u': lambda: store1234(simd_ext, from_typ, 4, False),
        'andb': lambda: binary_op2("andb", simd_ext, from_typ),
        'xorb': lambda: binary_op2("xorb", simd_ext, from_typ),
        'orb': lambda: binary_op2("orb", simd_ext, from_typ),
        'andl': lambda: logical_op2("andl", simd_ext, from_typ),
        'xorl': lambda: logical_op2("xorl", simd_ext, from_typ),
        'orl': lambda: logical_op2("orl", simd_ext, from_typ),
        'notb': lambda: not1(simd_ext, from_typ),
        'notl': lambda: lnot1(simd_ext, from_typ),
        'andnotb': lambda: binary_op2("andnotb", simd_ext, from_typ),
        'andnotl': lambda: logical_op2("andnotl", simd_ext, from_typ),
        'add': lambda: simple_op2("add", simd_ext, from_typ),
        'adds': lambda: add_sub_s("adds", simd_ext, from_typ),
        'sub': lambda: simple_op2("sub", simd_ext, from_typ),
        'subs': lambda: add_sub_s("subs", simd_ext, from_typ),
        'div': lambda: div2(simd_ext, from_typ),
        'sqrt': lambda: sqrt1(simd_ext, from_typ),
        'len': lambda: len1(simd_ext, from_typ),
        'mul': lambda: simple_op2("mul", simd_ext, from_typ),
        'shl': lambda: shift2("shl", simd_ext, from_typ),
        'shr': lambda: shift2("shr", simd_ext, from_typ),
        'shra': lambda: shift2("shra", simd_ext, from_typ),
        'set1': lambda: set1(simd_ext, from_typ),
        'set1l': lambda: lset1(simd_ext, from_typ),
        'eq': lambda: cmp2("eq", simd_ext, from_typ),
        'lt': lambda: cmp2("lt", simd_ext, from_typ),
        'le': lambda: cmp2("le", simd_ext, from_typ),
        'gt': lambda: cmp2("gt", simd_ext, from_typ),
        'ge': lambda: cmp2("ge", simd_ext, from_typ),
        'ne': lambda: cmp2("ne", simd_ext, from_typ),
        'if_else1': lambda: if_else3(simd_ext, from_typ),
        'min': lambda: minmax2("min", simd_ext, from_typ),
        'max': lambda: minmax2("max", simd_ext, from_typ),
        'loadla': lambda: loadl(True, simd_ext, from_typ),
        'loadlu': lambda: loadl(False, simd_ext, from_typ),
        'storela': lambda: storel(True, simd_ext, from_typ),
        'storelu': lambda: storel(False, simd_ext, from_typ),
        'abs': lambda: abs1(simd_ext, from_typ),
        'fma': lambda: fma("fma", simd_ext, from_typ),
        'fnma': lambda: fma("fnma", simd_ext, from_typ),
        'fms': lambda: fma("fms", simd_ext, from_typ),
        'fnms': lambda: fma("fnms", simd_ext, from_typ),
        'ceil': lambda: round1("ceil", simd_ext, from_typ),
        'floor': lambda: round1("floor", simd_ext, from_typ),
        'trunc': lambda: round1("trunc", simd_ext, from_typ),
        'round_to_even': lambda: round1("round_to_even", simd_ext, from_typ),
        'all': lambda: allany1("all", simd_ext, from_typ),
        'any': lambda: allany1("any", simd_ext, from_typ),
        'reinterpret': lambda: reinterpret1(simd_ext, from_typ, to_typ),
        'reinterpretl': lambda: reinterpretl1(simd_ext, from_typ, to_typ),
        'cvt': lambda: convert1(simd_ext, from_typ, to_typ),
        'rec8': lambda: recs1("rec8", simd_ext, from_typ),
        'rec11': lambda: recs1("rec11", simd_ext, from_typ),
        'rsqrt8': lambda: recs1("rsqrt8", simd_ext, from_typ),
        'rsqrt11': lambda: recs1("rsqrt11", simd_ext, from_typ),
        'rec': lambda: recs1("rec", simd_ext, from_typ),
        'neg': lambda: neg1(simd_ext, from_typ),
        'nbtrue': lambda: nbtrue1(simd_ext, from_typ),
        'reverse': lambda: reverse1(simd_ext, from_typ),
        'addv': lambda: addv(simd_ext, from_typ),
        'upcvt': lambda: upcvt1(simd_ext, from_typ, to_typ),
        'downcvt': lambda: downcvt1(simd_ext, from_typ, to_typ),
        'iota': lambda: iota(simd_ext, from_typ),
        'to_logical': lambda: to_logical(simd_ext, from_typ),
        'mask_for_loop_tail': lambda: mask_for_loop_tail(simd_ext, from_typ),
        'masko_loadu1': lambda: maskoz_load("o", simd_ext, from_typ),
        'maskz_loadu1': lambda: maskoz_load("z", simd_ext, from_typ),
        'masko_loada1': lambda: maskoz_load("o", simd_ext, from_typ),
        'maskz_loada1': lambda: maskoz_load("z", simd_ext, from_typ),
        'mask_storea1': lambda: mask_store(simd_ext, from_typ),
        'mask_storeu1': lambda: mask_store(simd_ext, from_typ),
        'gather': lambda: gather(simd_ext, from_typ),
        'scatter': lambda: scatter(simd_ext, from_typ),
        'gather_linear': lambda: gather_linear(simd_ext, from_typ),
        'scatter_linear': lambda: scatter_linear(simd_ext, from_typ),
        'to_mask': lambda: to_mask(simd_ext, from_typ),
        'ziplo': lambda: zip("ziplo", simd_ext, from_typ),
        'ziphi': lambda: zip("ziphi", simd_ext, from_typ),
        'zip': lambda: zip_unzip_basic("zip", simd_ext, from_typ),
        'unzip': lambda: zip_unzip_basic("unzip", simd_ext, from_typ),
        'unziplo': lambda: unzip("unziplo", simd_ext, from_typ),
        'unziphi': lambda: unzip("unziphi", simd_ext, from_typ)
    }
    if simd_ext not in get_simd_exts():
        raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext))
    if not from_typ in common.types:
        raise ValueError('Unknown type "{}"'.format(from_typ))
    if not func in impls:
        return common.NOT_IMPLEMENTED
    else:
        return impls[func]()
# -----------------------------------------------------------------------------
# Helpers

sse = ['sse2', 'sse42']
avx = ['avx', 'avx2']
avx512 = ['avx512_knl', 'avx512_skylake']

# -----------------------------------------------------------------------------
# Implementation of mandatory functions for this module

def get_simd_exts():
    '''List every x86 SIMD extension supported by this module, oldest
    first.'''
    return sse + avx + avx512

def get_prev_simd_ext(simd_ext):
    '''Return the SIMD extension immediately below `simd_ext` in the x86
    hierarchy ('cpu' below sse2).  Raises ValueError on unknown input.'''
    below = {
        'sse2': 'cpu',
        'sse42': 'sse2',
        'avx': 'sse42',
        'avx2': 'avx',
        'avx512_knl': 'avx2',
        'avx512_skylake': 'avx2'
    }
    if simd_ext not in below:
        raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext))
    return below[simd_ext]

def emulate_fp16(simd_ext):
    '''f16 is always emulated on x86 (pair of f32 vectors); only validate
    the extension name.'''
    if simd_ext not in get_simd_exts():
        raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext))
    return True

def get_native_typ(simd_ext, typ):
    '''Return the Intel intrinsic vector type (__m128/__m256/__m512 family)
    backing `typ` on `simd_ext`.'''
    # Vector width in bits depends only on the extension family.
    if simd_ext in sse:
        bits = '128'
    elif simd_ext in avx:
        bits = '256'
    elif simd_ext in avx512:
        bits = '512'
    else:
        raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext))
    # Floating types carry a suffix; every integer type shares '__m{bits}i'.
    float_suffix = {'f32': '', 'f64': 'd'}
    if typ in float_suffix:
        return '__m' + bits + float_suffix[typ]
    if typ in common.iutypes:
        return '__m' + bits + 'i'
return 'typedef struct {{{t} v0; {t} v1; }} {nsimd_typ};'. \ format(t=get_native_typ(simd_ext, 'f32'), nsimd_typ=nsimd_typ) else: return 'typedef {} {};'.format(get_native_typ(simd_ext, typ), nsimd_typ) def get_logical_type(opts, simd_ext, typ, nsimd_typ): if typ not in common.types: raise ValueError('Unknown type "{}"'.format(typ)) if simd_ext in sse + avx: return get_type(opts, simd_ext, typ, nsimd_typ) elif simd_ext in avx512: if typ == 'f16': return 'typedef struct {{ __mmask16 v0; __mmask16 v1; }} {};'. \ format(nsimd_typ) return 'typedef __mmask{} {};'. \ format(512 // common.bitsize(typ), nsimd_typ) else: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) def get_nb_registers(simd_ext): if simd_ext in sse + avx: return '16' elif simd_ext in avx512: return '32' else: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) def has_compatible_SoA_types(simd_ext): if simd_ext not in sse + avx + avx512: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) else: return False def get_additional_include(func, platform, simd_ext): ret = '' if simd_ext == 'sse2': ret += '''#include '''.format(func) elif simd_ext == 'sse42': ret += '''#include '''.format(func) elif simd_ext == 'avx': ret += '''#include '''.format(func) elif simd_ext == 'avx2': ret += '''#include '''.format(func) elif simd_ext == 'avx512_knl': ret += '''#include '''.format(func) elif simd_ext == 'avx512_skylake': ret += '''#include '''.format(func) if func == 'shra': ret += '''#include '''.format(simd_ext=simd_ext) if func in ['loadla', 'loadlu', 'storela', 'storelu']: ret += '''#include # include # include # include '''.format(simd_ext=simd_ext) if func in ['masko_loada1', 'masko_loadu1', 'maskz_loada1', 'maskz_loadu1', 'mask_storea1', 'mask_storeu1']: ret += '''#include ''' if func in ['notb']: ret += '''#include '''.format(simd_ext=simd_ext) if func in ['notl']: ret += '''#include # include '''.format(simd_ext=simd_ext) if func in ['min', 'max']: ret += '''#include 
'''.format(simd_ext=simd_ext) if func in ['lt']: ret += '''#include '''.format(simd_ext=simd_ext) if func in ['ge']: ret += '''#include '''.format(simd_ext=simd_ext) if func in ['if_else1']: ret += '''#include # include # include # include '''.format(simd_ext=simd_ext) if func in ['abs']: ret += '''#include # include '''.format(simd_ext=simd_ext) if func == 'reinterpretl' and simd_ext in ['sse', 'avx']: ret += '''#include # include '''.format(simd_ext=simd_ext) if func == 'upcvt': ret += '''#include '''.format(simd_ext=simd_ext) if func == 'ziplo' and simd_ext in ['avx512_knl', 'avx512_skylake']: ret += '''#include '''.format(simd_ext=simd_ext) if func == 'ziphi' and simd_ext in ['avx512_knl', 'avx512_skylake']: ret += '''#include '''.format(simd_ext=simd_ext) if func == 'zip': ret += '''#include #include '''.format(simd_ext=simd_ext) if func == 'unzip': ret += '''#include #include '''.format(simd_ext=simd_ext) if simd_ext in avx512 and func in ['loadlu', 'loadla']: ret += ''' # if NSIMD_CXX > 0 extern "C" {{ # endif NSIMD_INLINE nsimd_{simd_ext}_vlu16 NSIMD_VECTORCALL nsimd_{func}_{simd_ext}_u16(const u16*); # if NSIMD_CXX > 0 }} // extern "C" # endif '''.format(func=func, simd_ext=simd_ext) if func in ['load2u', 'load3u', 'load4u', 'load2a', 'load3a', 'load4a']: ret += ''' # include # include # if NSIMD_CXX > 0 extern "C" {{ # endif NSIMD_INLINE nsimd_{simd_ext}_vu16x{deg} NSIMD_VECTORCALL nsimd_{func}_{simd_ext}_u16(const u16*); # if NSIMD_CXX > 0 }} // extern "C" # endif '''.format(func=func, deg=func[4], simd_ext=simd_ext) if func in ['store2u', 'store3u', 'store4u', 'store2a', 'store3a', 'store4a']: deg = func[5] args = ','.join(['nsimd_{simd_ext}_vu16'.format(simd_ext=simd_ext) for i in range(1, int(deg) + 1)]) ret += ''' # include # include # if NSIMD_CXX > 0 extern "C" {{ # endif NSIMD_INLINE void NSIMD_VECTORCALL nsimd_{func}_{simd_ext}_u16(u16*, {args}); # if NSIMD_CXX > 0 }} // extern "C" # endif '''.format(func=func, deg=deg, args=args, 
simd_ext=simd_ext) if func == 'to_logical': ret += '''#include #include '''.format(simd_ext=simd_ext) if func == 'adds': ret += '''#include #include #include #include #include #include #include #include #if NSIMD_CXX > 0 #include #else #include #endif ''' .format(simd_ext=simd_ext) if simd_ext in avx512: ret += '''#include '''.format(simd_ext=simd_ext) if func == 'subs': ret += '''#include #include #include #include #include #include '''.format(simd_ext=simd_ext) if func == 'mask_for_loop_tail': ret += '''#include #include #include #include '''.format(simd_ext=simd_ext) return ret # ----------------------------------------------------------------------------- # Function prefixes and suffixes def pre(simd_ext): # Number of bits if simd_ext in sse: bits = '' elif simd_ext in avx: bits = '256' elif simd_ext in avx512: bits = '512' else: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) return '_mm{}_'.format(bits) def suf_ep(typ): if typ == 'f16': return '_ph' elif typ == 'f32': return '_ps' elif typ == 'f64': return '_pd' elif typ in common.iutypes: return '_epi{}'.format(typ[1:]) else: raise ValueError('Unknown type "{}"'.format(typ)) def nbits(simd_ext): if simd_ext in sse: return '128' elif simd_ext in avx: return '256' else: return '512' def suf_si(simd_ext, typ): if typ == 'f16': return '_ph' elif typ == 'f32': return '_ps' elif typ == 'f64': return '_pd' elif typ in common.iutypes: return '_si{}'.format(nbits(simd_ext)) else: raise ValueError('Unknown type "{}"'.format(typ)) # ----------------------------------------------------------------------------- # Other helper functions fmtspec = {} LO = 0 HI = 1 def castsi(simd_ext, typ): if typ in common.ftypes: return '' else: return '(__m{}i *)'.format(nbits(simd_ext)) def extract(simd_ext, typ, lohi, var): if simd_ext in avx: lohi_arg = '0' if lohi == LO else '1' if typ == 'f32': if lohi == LO: return '_mm256_castps256_ps128({})'.format(var) else: return '_mm256_extractf128_ps({}, 1)'.format(var) 
def set_lane(simd_ext, typ, var_name, scalar, i):
    '''Return a C statement setting lane `i` of vector `var_name` to the C
    expression `scalar`, for type `typ` on `simd_ext`.  Returns '' for f16
    (handled elsewhere as an emulated pair).'''
    # No code for f16's
    if typ == 'f16':
        return ''
    # Step 1 -- reinterpret the input operands as integers: the insert
    # intrinsics used below only exist for integer element types.
    # NOTE(review): the u8/u16 branch below is an `if` while the following
    # chain restarts with `if i8/i16` -- looks accidental, but is benign
    # since u8/u16 match none of the later branches, so vin0/vin1 survive.
    if typ in ['u8', 'u16']:
        vin0 = var_name
        if simd_ext in sse:
            vin1 = '(int)({})'.format(scalar)
        else:
            vin1 = scalar
    if typ in ['i8', 'i16']:
        vin0 = var_name
        vin1 = '(int)nsimd_scalar_reinterpret_{}_{}({})'. \
               format('u' + typ[1:], typ, scalar)
    elif typ in ['i32', 'i64']:
        vin0 = var_name
        vin1 = scalar
    elif typ in ['u32', 'f32', 'u64', 'f64']:
        if typ in ['u32', 'u64']:
            vin0 = var_name
        else:
            # Float vectors must first be cast to the integer vector type.
            vin0 = '{pre}cast{pspd}_si{nbits}({var_name})'. \
                   format(pspd='ps' if typ == 'f32' else 'pd',
                          var_name=var_name, **fmtspec)
        vin1 = 'nsimd_scalar_reinterpret_{}_{}({})'. \
               format('i' + typ[1:], typ, scalar)
    # Step 2 -- emit the insertion itself.
    if simd_ext == 'sse2':
        # sse2 only has _mm_insert_epi16, so 8/32/64-bit lanes are built
        # from 16-bit inserts plus masking/shifting (or unpacks for 64-bit).
        if typ[1:] == '8':
            if i % 2 == 0:
                # Even byte: replace the low half of the 16-bit lane.
                tmp = '_mm_insert_epi16({vin0}, ' \
                      '(_mm_extract_epi16({vin0}, {io2}) & 65280) | {vin1}, ' \
                      '{io2})'.format(vin0=vin0, vin1=vin1, io2=int(i // 2))
            else:
                # Odd byte: replace the high half of the 16-bit lane.
                tmp = '_mm_insert_epi16({vin0}, ' \
                      '(_mm_extract_epi16({vin0}, {io2}) & 255) | ' \
                      '({vin1} << 8), {io2})'. \
                      format(vin0=vin0, vin1=vin1, io2=int(i // 2))
        if typ[1:] == '16':
            tmp = '_mm_insert_epi16({}, {}, {})'.format(vin0, vin1, i)
        if typ[1:] == '32':
            # 32-bit lane = two consecutive 16-bit inserts (low then high).
            tmp = '_mm_insert_epi16(_mm_insert_epi16({vin0}, {vin1} & 65535,' \
                  ' {ix2}), (int)nsimd_scalar_reinterpret_u32_i32(' \
                  '{vin1}) >> 16, {ix2p1})'.format(vin0=vin0, vin1=vin1,
                                                   ix2=i * 2,
                                                   ix2p1=(i * 2) + 1)
        if typ[1:] == '64':
            # 64-bit lane via unpack with a vector holding the new value.
            if i == 0:
                tmp = '_mm_unpackhi_epi64(_mm_slli_si128(' \
                      '_mm_cvtsi64_si128({vin1}), 8), {vin0})'. \
                      format(vin0=vin0, vin1=vin1)
            elif i == 1:
                tmp = '_mm_unpacklo_epi64({vin0}, ' \
                      '_mm_cvtsi64_si128({vin1}))'.format(vin0=vin0,
                                                          vin1=vin1)
    elif simd_ext in ['sse42'] + avx:
        # sse42/avx/avx2 expose insert_epi for every width directly.
        tmp = '{pre}insert_epi{typnbits}({vin0}, {vin1}, {i})'. \
              format(vin0=vin0, vin1=vin1, i=i, **fmtspec)
    elif simd_ext in avx512:
        # avx512 has no direct 512-bit lane insert: pull out the 256-bit
        # half containing lane i, insert there, then put the half back.
        half = int(nbits(simd_ext)) // 2 // int(typ[1:])
        if i < half:
            tmp = '_mm512_inserti64x4({vin0}, _mm256_insert_epi{typnbits}(' \
                  '_mm512_castsi512_si256({vin0}), {vin1}, {i}), 0)'. \
                  format(vin0=vin0, vin1=vin1, i=i, **fmtspec)
        else:
            tmp = '_mm512_inserti64x4({vin0}, _mm256_insert_epi{typnbits}(' \
                  '_mm512_extracti64x4_epi64({vin0}, 1), {vin1}, {i}),' \
                  ' 1)'.format(vin0=vin0, vin1=vin1, i=i - half, **fmtspec)
    # Step 3 -- reinterpret the integer result back to the output type.
    if typ in common.iutypes:
        return '{} = {};'.format(var_name, tmp)
    elif typ in ['f32', 'f64']:
        return '{var_name} = {pre}castsi{nbits}_{pdps}({tmp});'. \
               format(var_name=var_name, pdps='ps' if typ == 'f32' else 'pd',
                      tmp=tmp, **fmtspec)
\ format(vin=vin, i=i, **fmtspec) else: half = int(nbits(simd_ext)) // 2 // int(typ[1:]) if i < half: ext_half = extract(simd_ext, 'i' + typ[1:], LO, vin) lane = '{}extract_epi{}({}, {})'.format( '_mm_' if simd_ext == 'avx' else '_mm256_', typ[1:], ext_half, i) else: ext_half = extract(simd_ext, 'i' + typ[1:], HI, vin) lane = '{}extract_epi{}({}, {})'.format( '_mm_' if simd_ext == 'avx' else '_mm256_', typ[1:], ext_half, i - half) # Then code for reinterpreting bits of output: # - For 8 and 16-bits types intrinsics returns an 32-bits int # - For 32 and 64-bits types intrinsics returns an int of that size if typ in ['u8', 'u16']: return '({})({})'.format(typ, lane) if typ in ['i8', 'i16']: return 'nsimd_scalar_reinterpret_{}_{}(({})({}))'. \ format(typ, 'u' + typ[1:], 'u' + typ[1:], lane) elif typ in ['i32', 'i64']: return lane elif typ in ['u32', 'f32', 'u64', 'f64']: return 'nsimd_scalar_reinterpret_{}_{}({})'. \ format(typ, 'i' + typ[1:], lane) def get_undefined(simd_ext, typ): if typ in ['f32', 'f64']: return '{pre}undefined{suf}()'.format(**fmtspec) elif typ in common.iutypes: if simd_ext in sse + avx: return '{pre}undefined{sufsi}()'.format(**fmtspec) elif simd_ext in avx512: return '{pre}undefined_epi32()'.format(**fmtspec) # Signature must be a list of 'v', 's' # 'v' means vector so code to extract has to be emitted # 's' means base type so no need to write code for extraction def get_emulation_code(func, signature, simd_ext, typ): # Trick using insert and extract trick = 'nsimd_{simd_ext}_v{typ} ret = {undef};\n'. \ format(undef=get_undefined(simd_ext, typ), **fmtspec) arity = len(signature) trick += typ + ' ' + \ ', '.join(['tmp{}'.format(i) \ for i in range(arity) if signature[i] == 'v']) + ';\n' args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \ if signature[i] == 's' else 'tmp{}'.format(i) \ for i in range(arity)]) for i in range(fmtspec['le']): trick += '\n'.join(['tmp{} = {};'. 
\ format(j, get_lane(simd_ext, typ, '{{in{}}}'.format(j).format(**fmtspec), i)) \ for j in range(arity) if signature[j] == 'v']) + '\n' trick += set_lane(simd_ext, typ, 'ret', 'nsimd_scalar_{func}_{typ}({args})'. \ format(func=func, args=args, **fmtspec), i) + '\n' trick += 'return ret;' # but in 32-bits mode insert and extract instrinsics are almost never # available so we emulate emulation = 'int i;\n{typ} ret[{le}];\n'.format(**fmtspec) emulation += typ + ' ' + \ ', '.join(['buf{}[{}]'.format(i, fmtspec['le']) \ for i in range(arity) if signature[i] == 'v']) + \ ';\n' emulation += '\n'.join(['{{pre}}store{{sufsi}}({cast}buf{i}, {{in{i}}});'. \ format(i=i, cast=castsi(simd_ext, typ)). \ format(**fmtspec) \ for i in range(arity) if signature[i] == 'v']) + \ '\n' args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \ if signature[i] == 's' else 'buf{}[i]'.format(i) \ for i in range(arity)]) emulation += '''for (i = 0; i < {le}; i++) {{ ret[i] = nsimd_scalar_{func}_{typ}({args}); }} return {pre}loadu{sufsi}({cast}ret);'''. \ format(args=args, cast=castsi(simd_ext, typ), func=func, **fmtspec) if simd_ext == 'sse42' and \ typ in ['i8', 'u8', 'i16', 'u16', 'i32', 'u32', 'f32']: return trick else: return '''#if NSIMD_WORD_SIZE == 32 {} #else {} #endif'''.format(emulation, trick) def how_it_should_be_op2(func, simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = {pre}{func}_ps({in0}.v0, {in1}.v0); ret.v1 = {pre}{func}_ps({in0}.v1, {in1}.v1); return ret;'''.format(func=func, **fmtspec) else: return 'return {pre}{func}{suf}({in0}, {in1});'. \ format(func=func, **fmtspec) def split_opn(func, simd_ext, typ, n): simd_ext2 = 'sse42' if simd_ext in avx else 'avx2' inp = [common.in0, common.in1, common.in2] defi = '' for i in range(0, n): defi += \ '''nsimd_{simd_ext2}_v{typ} v{i}0 = {extract_loi}; nsimd_{simd_ext2}_v{typ} v{i}1 = {extract_hii};'''. 
\ format(simd_ext2=simd_ext2, typ=typ, i=i, extract_loi=extract(simd_ext, typ, LO, inp[i]), extract_hii=extract(simd_ext, typ, HI, inp[i])) vlo = ', '.join(['v{}0'.format(i) for i in range(0, n)]) vhi = ', '.join(['v{}1'.format(i) for i in range(0, n)]) return '''{defi} v00 = nsimd_{func}_{simd_ext2}_{typ}({vlo}); v01 = nsimd_{func}_{simd_ext2}_{typ}({vhi}); return {merge};'''. \ format(defi=defi, vlo=vlo, vhi=vhi, func=func, simd_ext2=simd_ext2, typ=typ, merge=setr(simd_ext, typ, 'v00', 'v01')) def split_op2(func, simd_ext, typ): return split_opn(func, simd_ext, typ, 2) def emulate_op2(opts, op, simd_ext, typ): func = {'/': 'div', '*': 'mul'} return get_emulation_code(func[op], ['v', 'v'], simd_ext, typ) def emulate_op1(opts, func, simd_ext, typ): return get_emulation_code(func, ['v'], simd_ext, typ) def split_cmp2(func, simd_ext, typ): simd_ext2 = 'sse42' if simd_ext in avx else 'avx2' leo2 = int(fmtspec['le']) // 2 if simd_ext in avx512: if typ in ['i8', 'u8', 'f32', 'f64']: merge = \ '''return (__mmask{le})(u32)_mm256_movemask{suf}( v00) | ((__mmask{le})(u32)_mm256_movemask{suf}( v01) << {leo2});'''. \ format(leo2=leo2, **fmtspec) elif typ in ['i32', 'u32', 'i64', 'u64']: ftyp = 'f{typnbits}'.format(**fmtspec) merge = \ '''return (__mmask{le})(u32)_mm256_movemask{fsuf}( _mm256_castsi256{suf}(v00)) | (((__mmask{le})(u32)_mm256_movemask{fsuf}( _mm256_castsi256{suf}(v01))) << {leo2});'''. 
\ format(fsuf=suf_ep(ftyp), leo2=leo2, **fmtspec) else: merge = \ '''v00 = _mm256_permute4x64_epi64(v00, 216); /* exchange middle qwords */ nsimd_avx2_vi16 lo1 = _mm256_unpacklo_epi16(v00, v00); nsimd_avx2_vi16 hi1 = _mm256_unpackhi_epi16(v00, v00); v01 = _mm256_permute4x64_epi64(v01, 216); /* exchange middle qwords */ nsimd_avx2_vi16 lo2 = _mm256_unpacklo_epi16(v01, v01); nsimd_avx2_vi16 hi2 = _mm256_unpackhi_epi16(v01, v01); return (__mmask32)(u32)_mm256_movemask_ps( _mm256_castsi256_ps(lo1)) | (__mmask32)((u32)_mm256_movemask_ps( _mm256_castsi256_ps(hi1)) << 8) | (__mmask32)((u32)_mm256_movemask_ps( _mm256_castsi256_ps(lo2)) << 16) | (__mmask32)((u32)_mm256_movemask_ps( _mm256_castsi256_ps(hi2)) << 24);'''. \ format(**fmtspec) else: merge = 'return {};'.format(setr(simd_ext, typ, 'v00', 'v01')) return '''nsimd_{simd_ext2}_v{typ} v00 = {extract_lo0}; nsimd_{simd_ext2}_v{typ} v01 = {extract_hi0}; nsimd_{simd_ext2}_v{typ} v10 = {extract_lo1}; nsimd_{simd_ext2}_v{typ} v11 = {extract_hi1}; v00 = nsimd_{func}_{simd_ext2}_{typ}(v00, v10); v01 = nsimd_{func}_{simd_ext2}_{typ}(v01, v11); {merge}'''. \ format(simd_ext2=simd_ext2, extract_lo0=extract(simd_ext, typ, LO, common.in0), extract_hi0=extract(simd_ext, typ, HI, common.in0), extract_lo1=extract(simd_ext, typ, LO, common.in1), extract_hi1=extract(simd_ext, typ, HI, common.in1), func=func, merge=merge, **fmtspec) def f16_cmp2(func, simd_ext): return '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = nsimd_{func}_{simd_ext}_f32({in0}.v0, {in1}.v0); ret.v1 = nsimd_{func}_{simd_ext}_f32({in0}.v1, {in1}.v1); return ret;'''.format(func=func, **fmtspec) def cmp2_with_add(func, simd_ext, typ): cte = { 'u8': '0x80', 'u16': '0x8000', 'u32': '0x80000000', 'u64': '0x8000000000000000' } return \ '''nsimd_{simd_ext}_v{typ} cte = nsimd_set1_{simd_ext}_{typ}({cte}); return nsimd_{func}_{simd_ext}_{ityp}( {pre}add{suf}({in0}, cte), {pre}add{suf}({in1}, cte));'''. 
\ format(func=func, cte=cte[typ], ityp='i{}'.format(typ[1:]), **fmtspec) # ----------------------------------------------------------------------------- # Returns C code for func # Load def load(simd_ext, typ, aligned): align = '' if aligned else 'u' cast = castsi(simd_ext, typ) if typ == 'f16': if simd_ext in sse: return \ '''#ifdef NSIMD_FP16 nsimd_{simd_ext}_vf16 ret; __m128i v = _mm_load{align}_si128((__m128i*){in0}); ret.v0 = _mm_cvtph_ps(v); v = _mm_shuffle_epi32(v, 14); /* = (3 << 2) | (2 << 0) */ ret.v1 = _mm_cvtph_ps(v); return ret; #else /* Note that we can do much better but is it useful? */ nsimd_{simd_ext}_vf16 ret; f32 buf[4]; buf[0] = nsimd_u16_to_f32(*(u16*){in0}); buf[1] = nsimd_u16_to_f32(*((u16*){in0} + 1)); buf[2] = nsimd_u16_to_f32(*((u16*){in0} + 2)); buf[3] = nsimd_u16_to_f32(*((u16*){in0} + 3)); ret.v0 = _mm_loadu_ps(buf); buf[0] = nsimd_u16_to_f32(*((u16*){in0} + 4)); buf[1] = nsimd_u16_to_f32(*((u16*){in0} + 5)); buf[2] = nsimd_u16_to_f32(*((u16*){in0} + 6)); buf[3] = nsimd_u16_to_f32(*((u16*){in0} + 7)); ret.v1 = _mm_loadu_ps(buf); return ret; #endif'''.format(align=align, **fmtspec) elif simd_ext in avx: return '''#ifdef NSIMD_FP16 nsimd_{simd_ext}_vf16 ret; ret.v0 = _mm256_cvtph_ps(_mm_load{align}_si128( (__m128i*){in0})); ret.v1 = _mm256_cvtph_ps(_mm_load{align}_si128( (__m128i*){in0} + 1)); return ret; #else /* Note that we can do much better but is it useful? 
*/ nsimd_{simd_ext}_vf16 ret; f32 buf[8]; int i; for (i = 0; i < 8; i++) {{ buf[i] = nsimd_u16_to_f32(*((u16*){in0} + i)); }} ret.v0 = _mm256_loadu_ps(buf); for (i = 0; i < 8; i++) {{ buf[i] = nsimd_u16_to_f32(*((u16*){in0} + (8 + i))); }} ret.v1 = _mm256_loadu_ps(buf); return ret; #endif'''.format(align=align, **fmtspec) elif simd_ext in avx512: return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = _mm512_cvtph_ps( _mm256_load{align}_si256((__m256i*){in0}) ); ret.v1 = _mm512_cvtph_ps( _mm256_load{align}_si256((__m256i*){in0} + 1) ); return ret; '''.format(align=align, **fmtspec) else: return 'return {pre}load{align}{sufsi}({cast}{in0});'. \ format(align=align, cast=cast, **fmtspec) # ----------------------------------------------------------------------------- # masked loads def maskoz_load(simd_ext, typ, oz, aligned): if typ == 'f16': le2 = fmtspec['le'] // 2 if simd_ext in sse + avx: store_mask = '''{pre}storeu_ps(mask, {in0}.v0); {pre}storeu_ps(mask + {le2}, {in0}.v1);'''. \ format(le2=le2, **fmtspec) else: store_mask = '''_mm512_storeu_ps(mask, _mm512_maskz_mov_ps( {in0}.v0, _mm512_set1_ps(1.0f))); _mm512_storeu_ps(mask + {le2}, _mm512_maskz_mov_ps( {in0}.v1, _mm512_set1_ps(1.0f)));'''. 
\ format(le2=le2, **fmtspec) return '''int i; nsimd_{simd_ext}_vf16 ret; f32 buf[{le}], mask[{le}]; {store_mask} {pre}storeu_ps(buf, {oz0}); {pre}storeu_ps(buf + {le2}, {oz1}); for (i = 0; i < {le}; i++) {{ if (nsimd_scalar_reinterpret_u32_f32(mask[i]) != (u32)0) {{ buf[i] = nsimd_f16_to_f32({in1}[i]); }} }} ret.v0 = {pre}loadu_ps(buf); ret.v1 = {pre}loadu_ps(buf + {le2}); return ret;'''.format(le2=fmtspec['le'] // 2, oz0 = '{pre}setzero_ps()'.format(**fmtspec) if oz == 'z' \ else '{in2}.v0'.format(**fmtspec), oz1 = '{pre}setzero_ps()'.format(**fmtspec) if oz == 'z' \ else '{in2}.v1'.format(**fmtspec), store_mask=store_mask, **fmtspec) if (typ in ['i8', 'u8', 'i16', 'u16'] and simd_ext != 'avx512_skylake') \ or (typ in ['i32', 'u32', 'f32', 'i64', 'u64', 'f64'] and \ simd_ext in sse): cast = castsi(simd_ext, typ) if simd_ext == 'avx512_knl': mask_decl = 'u64 mask;' store_mask = 'mask = (u64){in0};'.format(**fmtspec) cond = '(mask >> i) & 1' else: mask_decl = '{typ} mask[{le}];'.format(**fmtspec) store_mask = '{pre}storeu{sufsi}({cast}mask, {in0});'. \ format(cast=cast, **fmtspec) cond = 'nsimd_scalar_reinterpret_{utyp}_{typ}(mask[i]) != '\ '({utyp})0'.format(utyp='u' + typ[1:], **fmtspec) return \ '''int i; {typ} buf[{le}]; {mask_decl} {pre}storeu{sufsi}({cast}buf, {oz}); {store_mask} for (i = 0; i < {le}; i++) {{ if ({cond}) {{ buf[i] = {in1}[i]; }} }} return {pre}loadu{sufsi}({cast}buf);'''. \ format(cast=cast, mask_decl=mask_decl, store_mask=store_mask, cond=cond, oz='{in2}'.format(**fmtspec) if oz == 'o' else \ '{pre}setzero{sufsi}()'.format(**fmtspec), **fmtspec) # Here typ is 32 of 64-bits wide except if simd_ext in avx: suf2 = 'ps' if typ[1:] == '32' else 'pd' if typ in common.ftypes: maskload = \ '{pre}maskload{suf}({in1}, {pre}cast{suf2}_si256({in0}))'. \ format(suf2=suf2, **fmtspec) if oz == 'z': return 'return {};'.format(maskload) else: return \ 'return {pre}blendv{suf}({in2}, {maskload}, {in0});'. 
\ format(maskload=maskload, **fmtspec) else: if simd_ext == 'avx2': maskload = '{pre}maskload{suf}({cast}{in1}, {in0})'. \ format(cast='(nsimd_longlong *)' \ if typ in ['i64', 'u64'] else '(int *)', **fmtspec) if oz == 'z': return 'return {};'.format(maskload) else: return \ 'return {pre}blendv_epi8({in2}, {maskload}, {in0});'. \ format(maskload=maskload, **fmtspec) else: maskload = '{pre}maskload_{suf2}(({ftyp}*){in1}, {in0})'. \ format(suf2=suf2, ftyp='f' + typ[1:], **fmtspec) if oz == 'z': return 'return {pre}cast{suf2}_si256({maskload});'. \ format(maskload=maskload, suf2=suf2, **fmtspec) else: return \ '''return {pre}cast{suf2}_si256({pre}blendv_{suf2}( {pre}castsi256_{suf2}({in2}), {maskload}, {pre}castsi256_{suf2}({in0})));'''. \ format(suf2=suf2, maskload=maskload, **fmtspec) # getting here means avx512 with intrinsics mask = { 'z': 'return {pre}maskz_load{{}}{suf}({in0}, (void*){in1});'. \ format(**fmtspec), 'o': 'return {pre}mask_load{{}}{suf}({in2}, {in0}, (void*){in1});'. \ format(**fmtspec) } if typ in ['i32', 'u32', 'f32', 'i64', 'u64', 'f64']: return mask[oz].format('' if aligned else 'u') else: return mask[oz].format('u') # ----------------------------------------------------------------------------- # Loads of degree 2, 3 and 4 def load_deg234(simd_ext, typ, align, deg): if typ == 'f16': a = 'a' if align else 'u' code = '\n'.join([ \ '''nsimd_storeu_{simd_ext}_u16(buf, tmp.v{i}); ret.v{i} = nsimd_loadu_{simd_ext}_f16((f16 *)buf);'''. 
\ format(i=i, **fmtspec) for i in range(0, deg)]) return \ '''nsimd_{simd_ext}_v{typ}x{deg} ret; u16 buf[{le}]; nsimd_{simd_ext}_vu16x{deg} tmp = nsimd_load{deg}{a}_{simd_ext}_u16((u16*)a0); {code} return ret;'''.format(code=code, a=a, deg=deg, **fmtspec) if simd_ext in sse: if deg == 2: return ldst234.load2_sse(simd_ext, typ, align, fmtspec) if deg == 3: return ldst234.load3_sse(simd_ext, typ, align, fmtspec) if deg == 4: return ldst234.load4_sse(simd_ext, typ, align, fmtspec) if simd_ext in avx: if deg == 2: return ldst234.load2_avx(simd_ext, typ, align, fmtspec) if deg == 3: return ldst234.load3_avx(simd_ext, typ, align, fmtspec) if deg == 4: return ldst234.load4_avx(simd_ext, typ, align, fmtspec) if simd_ext in avx512: if deg == 2: return ldst234.load2_avx512(simd_ext, typ, align, fmtspec) if deg == 3: return ldst234.load3_avx512(simd_ext, typ, align, fmtspec) if deg == 4: return ldst234.load4_avx512(simd_ext, typ, align, fmtspec) return common.NOT_IMPLEMENTED # ----------------------------------------------------------------------------- # Stores of degree 2, 3 and 4 def store_deg234(simd_ext, typ, align, deg): if typ == 'f16': a = 'a' if align else 'u' variables = ', '.join(['v{}'.format(i) for i in range(0, deg)]) code = '\n'.join([ \ '''nsimd_storeu_{{simd_ext}}_f16((f16 *)buf, {{in{ip1}}}); v{i} = nsimd_loadu_{{simd_ext}}_u16((u16 *)buf);'''. \ format(i=i, ip1=i + 1).format(**fmtspec) \ for i in range(0, deg)]) return \ '''nsimd_{simd_ext}_vu16 {variables}; u16 buf[{le}]; {code} nsimd_store{deg}{a}_{simd_ext}_u16((u16 *){in0}, {variables});'''. 
\ format(variables=variables, code=code, a=a, deg=deg, **fmtspec) if simd_ext in sse: if deg == 2: return ldst234.store2(simd_ext, typ, align, fmtspec) if deg == 3: return ldst234.store3_sse(simd_ext, typ, align, fmtspec) if deg == 4: return ldst234.store4_sse(typ, align, fmtspec) if simd_ext in avx: if deg == 2: return ldst234.store2(simd_ext, typ, align, fmtspec) if deg == 3: return ldst234.store3_avx(simd_ext, typ, align, fmtspec) if deg == 4: return ldst234.store4_avx(simd_ext, typ, align, fmtspec) if simd_ext in avx512: if deg == 2: return ldst234.store2(simd_ext, typ, align, fmtspec) if deg == 3: return ldst234.store3_avx512(simd_ext, typ, align, fmtspec) if deg == 4: return ldst234.store4_avx512(simd_ext, typ, align, fmtspec) return common.NOT_IMPLEMENTED # ----------------------------------------------------------------------------- # Store def store(simd_ext, typ, aligned): align = '' if aligned else 'u' cast = castsi(simd_ext, typ) if typ == 'f16': if simd_ext in sse: return \ '''#ifdef NSIMD_FP16 __m128i v0 = _mm_cvtps_ph({in1}.v0, 4); __m128i v1 = _mm_cvtps_ph({in1}.v1, 4); __m128d v = _mm_shuffle_pd(_mm_castsi128_pd(v0), _mm_castsi128_pd(v1), 0 /* = (0 << 1) | (0 << 0) */); _mm_store{align}_pd((f64*){in0}, v); #else /* Note that we can do much better but is it useful? 
*/ f32 buf[4]; _mm_storeu_ps(buf, {in1}.v0); *((u16*){in0} ) = nsimd_f32_to_u16(buf[0]); *((u16*){in0} + 1) = nsimd_f32_to_u16(buf[1]); *((u16*){in0} + 2) = nsimd_f32_to_u16(buf[2]); *((u16*){in0} + 3) = nsimd_f32_to_u16(buf[3]); _mm_storeu_ps(buf, {in1}.v1); *((u16*){in0} + 4) = nsimd_f32_to_u16(buf[0]); *((u16*){in0} + 5) = nsimd_f32_to_u16(buf[1]); *((u16*){in0} + 6) = nsimd_f32_to_u16(buf[2]); *((u16*){in0} + 7) = nsimd_f32_to_u16(buf[3]); #endif'''.format(align=align, **fmtspec) elif simd_ext in avx: return \ '''#ifdef NSIMD_FP16 _mm_store{align}_si128((__m128i*){in0}, _mm256_cvtps_ph({in1}.v0, 4)); _mm_store{align}_si128((__m128i*){in0} + 1, _mm256_cvtps_ph({in1}.v1, 4)); #else /* Note that we can do much better but is it useful? */ int i; f32 buf[8]; _mm256_storeu_ps(buf, {in1}.v0); for (i = 0; i < 8; i++) {{ *((u16*){in0} + i) = nsimd_f32_to_u16(buf[i]); }} _mm256_storeu_ps(buf, {in1}.v1); for (i = 0; i < 8; i++) {{ *((u16*){in0} + (8 + i)) = nsimd_f32_to_u16(buf[i]); }} #endif'''.format(align=align, **fmtspec) elif simd_ext in avx512: return \ '''_mm256_store{align}_si256((__m256i*){in0}, _mm512_cvtps_ph({in1}.v0, 4)); _mm256_store{align}_si256((__m256i*){in0} + 1, _mm512_cvtps_ph({in1}.v1, 4));'''. \ format(align=align, **fmtspec) else: return '{pre}store{align}{sufsi}({cast}{in0}, {in1});'. \ format(align=align, cast=cast, **fmtspec) # masked store def mask_store(simd_ext, typ, aligned): if typ == 'f16': le2 = fmtspec['le'] // 2 if simd_ext in sse + avx: store_mask = '''{pre}storeu_ps(mask, {in0}.v0); {pre}storeu_ps(mask + {le2}, {in0}.v1);'''. \ format(le2=le2, **fmtspec) else: store_mask = '''_mm512_storeu_ps(mask, _mm512_maskz_mov_ps( {in0}.v0, _mm512_set1_ps(1.0f))); _mm512_storeu_ps(mask + {le2}, _mm512_maskz_mov_ps( {in0}.v1, _mm512_set1_ps(1.0f)));'''. 
\ format(le2=le2, **fmtspec) return '''f32 mask[{le}], buf[{le}]; int i; {store_mask} {pre}storeu_ps(buf, {in2}.v0); {pre}storeu_ps(buf + {le2}, {in2}.v1); for (i = 0; i < {le}; i++) {{ if (nsimd_scalar_reinterpret_u32_f32(mask[i]) != (u32)0) {{ {in1}[i] = nsimd_f32_to_f16(buf[i]); }} }}'''.format(store_mask=store_mask, le2=le2, **fmtspec) suf2 = 'ps' if typ[1:] == '32' else 'pd' if simd_ext in sse: if typ in common.iutypes: return '_mm_maskmoveu_si128({in2}, {in0}, (char *){in1});'. \ format(**fmtspec) else: return '''_mm_maskmoveu_si128(_mm_cast{suf2}_si128({in2}), _mm_cast{suf2}_si128({in0}), (char *){in1});'''. \ format(suf2=suf2, **fmtspec) if typ in ['i8', 'u8', 'i16', 'u16'] and simd_ext != 'avx512_skylake': if simd_ext == 'avx512_knl': return \ '''int i; u64 mask; {typ} buf[{le}]; {pre}storeu{sufsi}((__m512i *)buf, {in2}); mask = (u64){in0}; for (i = 0; i < {le}; i++) {{ if ((mask >> i) & 1) {{ {in1}[i] = buf[i]; }} }}'''.format(utyp='u' + typ[1:], **fmtspec) else: return \ '''nsimd_{op_name}_sse42_{typ}({mask_lo}, {in1}, {val_lo}); nsimd_{op_name}_sse42_{typ}({mask_hi}, {in1} + {le2}, {val_hi}); '''.format(le2=fmtspec['le'] // 2, op_name='mask_store{}1'.format('a' if aligned else 'u'), mask_lo=extract(simd_ext, typ, LO, common.in0), mask_hi=extract(simd_ext, typ, HI, common.in0), val_lo=extract(simd_ext, typ, LO, common.in2), val_hi=extract(simd_ext, typ, HI, common.in2), **fmtspec) # Here typ is 32 of 64-bits wide except if simd_ext in avx: if typ in common.ftypes: return '''{pre}maskstore{suf}({in1}, {pre}cast{suf2}_si256({in0}), {in2});'''. \ format(suf2=suf2, **fmtspec) else: if simd_ext == 'avx2': return '{pre}maskstore{suf}({cast}{in1}, {in0}, {in2});'. \ format(cast='(nsimd_longlong *)' \ if typ in ['i64', 'u64'] \ else '(int *)', **fmtspec) else: return '''{pre}maskstore_{suf2}(({ftyp}*){in1}, {in0}, {pre}castsi256_{suf2}({in2}));'''. 
\ format(suf2=suf2, ftyp='f' + typ[1:], **fmtspec) # getting here means avx512 with intrinsics code = '{pre}mask_store{{}}{suf}((void*){in1}, {in0}, {in2});'. \ format(**fmtspec) if typ in ['i32', 'u32', 'f32', 'i64', 'u64', 'f64']: return code.format('' if aligned else 'u') else: return code.format('u') # ----------------------------------------------------------------------------- # Code for binary operators: and, or, xor def binop2(func, simd_ext, typ, logical=False): logical = 'l' if logical else '' func = func[0:-1] if typ == 'f16': return \ '''nsimd_{simd_ext}_v{logi}f16 ret; ret.v0 = nsimd_{func}{logi2}_{simd_ext}_f32({in0}.v0, {in1}.v0); ret.v1 = nsimd_{func}{logi2}_{simd_ext}_f32({in0}.v1, {in1}.v1); return ret;'''.format(logi='l' if logical else '', func=func, logi2='l' if logical else 'b', **fmtspec) normal = 'return {pre}{func}{sufsi}({in0}, {in1});'. \ format(func=func, **fmtspec) if simd_ext in sse: return normal if simd_ext in avx: if simd_ext == 'avx2' or typ in ['f32', 'f64']: return normal else: return '''return _mm256_castpd_si256(_mm256_{func}_pd( _mm256_castsi256_pd({in0}), _mm256_castsi256_pd({in1})));'''. \ format(func=func, **fmtspec) if simd_ext in avx512: if simd_ext == 'avx512_skylake' or typ in common.iutypes: return normal else: return \ '''return _mm512_castsi512{suf}(_mm512_{func}_si512( _mm512_cast{typ2}_si512({in0}), _mm512_cast{typ2}_si512({in1})));'''. 
\ format(func=func, typ2=suf_ep(typ)[1:], **fmtspec) # ----------------------------------------------------------------------------- # Code for logical binary operators: andl, orl, xorl def binlop2(func, simd_ext, typ): op = { 'orl': '|', 'xorl': '^', 'andl': '&' } op_fct = { 'orl': 'kor', 'xorl': 'kxor', 'andl': 'kand' } if simd_ext not in avx512: if typ == 'f16': return binop2(func, simd_ext, typ, True) else: return binop2(func, simd_ext, typ) elif simd_ext == 'avx512_knl': if typ == 'f16': return '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = _{op_fct}_mask16({in0}.v0, {in1}.v0); ret.v1 = _{op_fct}_mask16({in0}.v1, {in1}.v1); return ret;'''. \ format(op_fct=op_fct[func], **fmtspec) elif typ in ['f32', 'u32', 'i32']: return 'return _{op_fct}_mask16({in0}, {in1});'. \ format(op_fct=op_fct[func], **fmtspec) else: return 'return (__mmask{le})({in0} {op} {in1});'. \ format(op=op[func], **fmtspec) elif simd_ext == 'avx512_skylake': if typ == 'f16': return '''nsimd_{simd_ext}_vlf16 ret; #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG) ret.v0 = (__mmask16)({in0}.v0 {op} {in1}.v0); ret.v1 = (__mmask16)({in0}.v1 {op} {in1}.v1); #else ret.v0 = _{op_fct}_mask16({in0}.v0, {in1}.v0); ret.v1 = _{op_fct}_mask16({in0}.v1, {in1}.v1); #endif return ret;'''. 
\ format(op_fct=op_fct[func], op=op[func], **fmtspec) else: return '''#if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG) return (__mmask{le})({in0} {op} {in1}); #else return _{op_fct}_mask{le}({in0}, {in1}); #endif'''.format(op_fct=op_fct[func], op=op[func], **fmtspec) # ----------------------------------------------------------------------------- # andnot def andnot2(simd_ext, typ, logical=False): if typ == 'f16': return \ '''nsimd_{simd_ext}_v{logi}f16 ret; ret.v0 = nsimd_andnot{logi2}_{simd_ext}_f32({in0}.v0, {in1}.v0); ret.v1 = nsimd_andnot{logi2}_{simd_ext}_f32({in0}.v1, {in1}.v1); return ret;'''.format(logi='l' if logical else '', logi2='l' if logical else 'b', **fmtspec) if simd_ext in sse: return 'return _mm_andnot{sufsi}({in1}, {in0});'.format(**fmtspec) if simd_ext in avx: if simd_ext == 'avx2' or typ in ['f32', 'f64']: return 'return _mm256_andnot{sufsi}({in1}, {in0});'. \ format(**fmtspec) else: return '''return _mm256_castpd_si256(_mm256_andnot_pd( _mm256_castsi256_pd({in1}), _mm256_castsi256_pd({in0})));'''. \ format(**fmtspec) if simd_ext in avx512: if simd_ext == 'avx512_skylake' or typ in common.iutypes: return 'return _mm512_andnot{sufsi}({in1}, {in0});'. \ format(**fmtspec) else: return '''return _mm512_castsi512{suf}(_mm512_andnot_si512( _mm512_cast{suf2}_si512({in1}), _mm512_cast{suf2}_si512({in0})));'''. 
\ format(suf2=fmtspec['suf'][1:], **fmtspec) # ----------------------------------------------------------------------------- # logical andnot def landnot2(simd_ext, typ): if simd_ext in avx512: if typ == 'f16': return '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = (__mmask16)({in0}.v0 & (~{in1}.v0)); ret.v1 = (__mmask16)({in0}.v1 & (~{in1}.v1)); return ret;'''.format(**fmtspec) else: return 'return (__mmask{le})({in0} & (~{in1}));'.format(**fmtspec) return andnot2(simd_ext, typ, True) # ----------------------------------------------------------------------------- # Code for unary not def not1(simd_ext, typ, logical=False): if typ == 'f16': return \ '''nsimd_{simd_ext}_v{logi}f16 ret; nsimd_{simd_ext}_vf32 cte = {pre}castsi{nbits}_ps( {pre}set1_epi8(-1)); ret.v0 = nsimd_andnot{logi2}_{simd_ext}_f32(cte, {in0}.v0); ret.v1 = nsimd_andnot{logi2}_{simd_ext}_f32(cte, {in0}.v1); return ret;'''.format(logi='l' if logical else '', logi2='l' if logical else 'b', **fmtspec) elif typ in ['f32', 'f64']: return '''return nsimd_andnotb_{simd_ext}_{typ}( {pre}castsi{nbits}{suf}( {pre}set1_epi8(-1)), {in0});'''.format(**fmtspec) else: return '''return nsimd_andnotb_{simd_ext}_{typ}( {pre}set1_epi8(-1), {in0});'''.format(**fmtspec) # ----------------------------------------------------------------------------- # Code for unary logical lnot def lnot1(simd_ext, typ): if simd_ext in avx512: if typ == 'f16': return '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = (__mmask16)(~{in0}.v0); ret.v1 = (__mmask16)(~{in0}.v1); return ret;'''.format(**fmtspec) else: return 'return (__mmask{le})(~{in0});'.format(**fmtspec) return not1(simd_ext, typ, True) # ----------------------------------------------------------------------------- # Addition and substraction def addsub(func, simd_ext, typ): if typ in common.ftypes or simd_ext in sse or \ (simd_ext in avx512 and typ in ['u32', 'i32', 'u64', 'i64']): return how_it_should_be_op2(func, simd_ext, typ) else: if simd_ext in ['avx2', 'avx512_skylake']: return 
how_it_should_be_op2(func, simd_ext, typ) else: return split_op2(func, simd_ext, typ) # ----------------------------------------------------------------------------- # Len def len1(simd_ext, typ): return 'return {le};'.format(**fmtspec) # ----------------------------------------------------------------------------- # Division def div2(opts, simd_ext, typ): if typ in common.ftypes: return how_it_should_be_op2('div', simd_ext, typ) return emulate_op2(opts, '/', simd_ext, typ) # ----------------------------------------------------------------------------- # Multiplication def mul2(opts, simd_ext, typ): emulate = emulate_op2(opts, '*', simd_ext, typ) split = split_op2('mul', simd_ext, typ) # Floats if typ in common.ftypes: return how_it_should_be_op2('mul', simd_ext, typ) # Integers 16, 32 on SSE if simd_ext in sse and typ in ['i16', 'u16']: return 'return _mm_mullo_epi16({in0}, {in1});'.format(**fmtspec) if simd_ext in sse and typ in ['i32', 'u32']: if simd_ext == 'sse42': return 'return _mm_mullo_epi32({in0}, {in1});'.format(**fmtspec) else: return emulate # Integers 16, 32 on AVX if simd_ext in avx and typ in ['i16', 'u16', 'i32', 'u32']: if simd_ext == 'avx2': return 'return _mm256_mullo{suf}({in0}, {in1});'.format(**fmtspec) else: return split # Integers 64 on SSE on AVX if simd_ext in sse + avx and typ in ['i64', 'u64']: return emulate_op2(opts, '*', simd_ext, typ) # Integers 16 on AVX512 if simd_ext in avx512 and typ in ['i16', 'u16']: if simd_ext == 'avx512_skylake': return 'return _mm512_mullo_epi16({in0}, {in1});'.format(**fmtspec) else: return split # Integers 32 on AVX512 if simd_ext in avx512 and typ in ['i32', 'u32']: return 'return _mm512_mullo_epi32({in1}, {in0});'.format(**fmtspec) # Integers 64 on AVX512 if simd_ext in avx512 and typ in ['i64', 'u64']: if simd_ext == 'avx512_skylake': return 'return _mm512_mullo_epi64({in0}, {in1});'.format(**fmtspec) else: return emulate # Integers 8 on SSE with_epi16 = '''nsimd_{simd_ext}_v{typ} lo = 
{pre}mullo_epi16({in0}, {in1}); nsimd_{simd_ext}_v{typ} hi = {pre}slli_epi16( {pre}mullo_epi16({pre}srli_epi16({in0}, 8), {pre}srli_epi16({in1}, 8)), 8); return {pre}or{sufsi}({pre}and{sufsi}( lo, {pre}set1_epi16(255)),hi);'''. \ format(**fmtspec) split_epi16 = split_op2('mul', simd_ext, typ) if simd_ext in sse and typ in ['i8', 'u8']: return with_epi16 if simd_ext in avx + avx512 and typ in ['i8', 'u8']: if simd_ext in ['avx2', 'avx512_skylake']: return with_epi16 else: return split_epi16 # ----------------------------------------------------------------------------- # Shift left and right def shl_shr(func, simd_ext, typ): if typ in ['f16', 'f32', 'f64']: return '' intrinsic = 'srl' if func == 'shr' else 'sll' simd_ext2 = 'sse42' if simd_ext in avx else 'avx2' split = '''nsimd_{simd_ext2}_v{typ} v0 = {extract_lo}; nsimd_{simd_ext2}_v{typ} v1 = {extract_hi}; v0 = nsimd_{func}_{simd_ext2}_{typ}(v0, {in1}); v1 = nsimd_{func}_{simd_ext2}_{typ}(v1, {in1}); return {merge};'''. \ format(simd_ext2=simd_ext2, func=func, extract_lo=extract(simd_ext, typ, LO, common.in0), extract_hi=extract(simd_ext, typ, HI, common.in0), merge=setr(simd_ext, typ, 'v0', 'v1'), **fmtspec) normal_16_32_64 = '''return {pre}{intrinsic}{suf}( {in0}, _mm_set1_epi64x({in1}));'''. \ format(intrinsic=intrinsic, **fmtspec) FFs = '0x' + ('F' * int((int(typ[1:]) // 4))) FFOOs = FFs + ('0' * int((int(typ[1:]) // 4))) with_2n_for_n = '''nsimd_{simd_ext}_v{typ} lo = {pre}and{sufsi}( {pre}{intrinsic}_epi{typ2nbits}( {in0}, _mm_set1_epi64x({in1})), nsimd_set1_{simd_ext}_u{typ2nbits}({masklo})); nsimd_{simd_ext}_v{typ} hi = {pre}{intrinsic}_epi{typ2nbits}({pre}and{sufsi}({in0}, nsimd_set1_{simd_ext}_u{typ2nbits}({maskhi})), _mm_set1_epi64x({in1})); return {pre}or{sufsi}(hi, lo);'''. 
\ format(intrinsic=intrinsic, typ2nbits=2 * int(typ[1:]), masklo=FFs if func == 'shl' else FFOOs, maskhi=FFOOs if func == 'shl' else FFs, **fmtspec) with_32_for_8 = '''nsimd_{simd_ext}_v{typ} masklo = nsimd_set1_{simd_ext}_u32(0xFF00FF); nsimd_{simd_ext}_v{typ} lo = {pre}and{sufsi}({pre}{intrinsic}_epi32( {pre}and{sufsi}({in0}, masklo), _mm_set1_epi64x({in1})), masklo); nsimd_{simd_ext}_v{typ} maskhi = nsimd_set1_{simd_ext}_u32(0xFF00FF00); nsimd_{simd_ext}_v{typ} hi = {pre}and{sufsi}({pre}{intrinsic}_epi32( {pre}and{sufsi}({in0}, maskhi), _mm_set1_epi64x({in1})), maskhi); return {pre}or{sufsi}(hi, lo);'''. \ format(intrinsic=intrinsic, **fmtspec) if simd_ext in sse: if typ in ['i8', 'u8']: return with_2n_for_n if typ in ['i16', 'u16', 'i32', 'u32', 'i64', 'u64']: return normal_16_32_64 if simd_ext in avx: if typ in ['i8', 'u8']: return with_2n_for_n if simd_ext == 'avx2' else split if typ in ['i16', 'u16', 'i32', 'u32', 'i64', 'u64']: return normal_16_32_64 if simd_ext == 'avx2' else split if simd_ext in avx512: if typ in ['i8', 'u8']: return with_2n_for_n if simd_ext == 'avx512_skylake' \ else with_32_for_8 if typ in ['i16', 'u16']: return normal_16_32_64 if simd_ext == 'avx512_skylake' \ else with_2n_for_n if typ in ['i32', 'u32', 'i64', 'u64']: return normal_16_32_64 # ----------------------------------------------------------------------------- # Arithmetic shift right def shra(opts, simd_ext, typ): if typ in common.utypes: # For unsigned type, logical shift return 'return nsimd_shr_{simd_ext}_{typ}({in0}, {in1});'. \ format(**fmtspec) intrinsic = 'return {pre}sra{suf}({in0}, _mm_set1_epi64x((i64){in1}));'. \ format(**fmtspec) simd_ext2 = 'sse42' if simd_ext in avx else 'avx2' split = '''nsimd_{simd_ext2}_v{typ} v0 = {extract_lo}; nsimd_{simd_ext2}_v{typ} v1 = {extract_hi}; v0 = nsimd_shra_{simd_ext2}_{typ}(v0, {in1}); v1 = nsimd_shra_{simd_ext2}_{typ}(v1, {in1}); return {merge};'''. 
\ format(simd_ext2=simd_ext2, extract_lo=extract(simd_ext, typ, LO, common.in0), extract_hi=extract(simd_ext, typ, HI, common.in0), merge=setr(simd_ext, typ, 'v0', 'v1'), **fmtspec) trick_for_i8 = \ '''__m128i count = _mm_set1_epi64x((i64){in1}); nsimd_{simd_ext}_vi16 lo, hi; hi = {pre}andnot{sufsi}({pre}set1_epi16(255), {pre}sra_epi16({in0}, count)); lo = {pre}srli_epi16({pre}sra_epi16( {pre}slli_epi16({in0}, 8), count), 8); return {pre}or{sufsi}(hi, lo);'''.format(**fmtspec) emulation = get_emulation_code('shra', ['v', 's'], simd_ext, typ) if simd_ext in sse + ['avx2']: if typ == 'i8': return trick_for_i8 elif typ in ['i16', 'i32']: return intrinsic elif typ == 'i64': return emulation elif simd_ext == 'avx': if typ in ['i8', 'i16', 'i32']: return split elif typ == 'i64': return emulation elif simd_ext == 'avx512_knl': if typ in ['i8', 'i16']: return split elif typ in ['i32', 'i64']: return intrinsic elif simd_ext == 'avx512_skylake': if typ == 'i8': return trick_for_i8 elif typ in ['i16', 'i32', 'i64']: return intrinsic # ----------------------------------------------------------------------------- # set1 or splat function def set1(simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; f32 f = nsimd_f16_to_f32({in0}); ret.v0 = {pre}set1_ps(f); ret.v1 = {pre}set1_ps(f); return ret;'''.format(**fmtspec) if simd_ext in sse + avx: if typ == 'i64': return 'return {pre}set1_epi64x({in0});'.format(**fmtspec) if typ == 'u64': return '''union {{ u64 u; i64 i; }} buf; buf.u = {in0}; return {pre}set1_epi64x(buf.i);'''.format(**fmtspec) if typ in ['u8', 'u16', 'u32', 'u64']: return '''union {{ {typ} u; i{typnbits} i; }} buf; buf.u = {in0}; return {pre}set1{suf}(buf.i);'''.format(**fmtspec) return 'return {pre}set1{suf}({in0});'.format(**fmtspec) # ----------------------------------------------------------------------------- # set1l or splat function for logical def set1l(simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = 
nsimd_set1l_{simd_ext}_f32({in0}); ret.v1 = ret.v0; return ret;'''.format(**fmtspec) if simd_ext in sse + avx: if simd_ext in sse: ones = '_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd())' else: ones = '_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), ' \ '_CMP_EQ_OQ)' if typ != 'f64': ones = '{pre}castpd{sufsi}({ones})'.format(ones=ones, **fmtspec) return '''if ({in0}) {{ return {ones}; }} else {{ return {pre}setzero{sufsi}(); }}'''.format(ones=ones, **fmtspec) else: return '''if ({in0}) {{ return (__mmask{le})(~(__mmask{le})(0)); }} else {{ return (__mmask{le})(0); }}'''.format(**fmtspec) # ----------------------------------------------------------------------------- # Equality def eq2(simd_ext, typ): if typ == 'f16': return f16_cmp2('eq', simd_ext) if simd_ext in sse: if typ in ['i64', 'u64']: if simd_ext == 'sse42': return how_it_should_be_op2('cmpeq', simd_ext, typ) else: return \ '''__m128i t = _mm_cmpeq_epi32({in0}, {in1}); return _mm_and_si128(t, _mm_shuffle_epi32(t, 177) /* = 2|3|0|1 */);'''. \ format(**fmtspec) else: return how_it_should_be_op2('cmpeq', simd_ext, typ) if simd_ext in avx: if typ in ['f32', 'f64']: return 'return _mm256_cmp{suf}({in0}, {in1}, _CMP_EQ_OQ);'. \ format(**fmtspec) else: if simd_ext == 'avx2': return how_it_should_be_op2('cmpeq', simd_ext, typ) else: return split_cmp2('eq', simd_ext, typ) if simd_ext in avx512: if typ in ['f32', 'f64']: return 'return _mm512_cmp{suf}_mask({in0}, {in1}, _CMP_EQ_OQ);'. \ format(**fmtspec) elif typ in ['i32', 'u32', 'i64', 'u64']: return \ 'return _mm512_cmp{suf}_mask({in0}, {in1}, _MM_CMPINT_EQ);'. \ format(**fmtspec) else: if simd_ext == 'avx512_skylake': return \ 'return _mm512_cmp{suf}_mask({in0}, {in1}, _MM_CMPINT_EQ);'. 
\ format(**fmtspec) else: return split_cmp2('eq', simd_ext, typ) # ----------------------------------------------------------------------------- # not equal def neq2(simd_ext, typ): if typ == 'f16': return f16_cmp2('ne', simd_ext) if simd_ext in sse and typ in ['f32', 'f64']: return how_it_should_be_op2('cmpneq', simd_ext, typ) if simd_ext in avx and typ in ['f32', 'f64']: return 'return _mm256_cmp{suf}({in0}, {in1}, _CMP_NEQ_UQ);'. \ format(**fmtspec) if simd_ext in avx512 and typ in ['f32', 'f64']: return 'return _mm512_cmp{suf}_mask({in0}, {in1}, _CMP_NEQ_UQ);'. \ format(**fmtspec) noteq = '''return nsimd_notl_{simd_ext}_{typ}( nsimd_eq_{simd_ext}_{typ}({in0}, {in1}));'''. \ format(**fmtspec) if simd_ext in avx512: intrinsic = \ 'return _mm512_cmp{suf}_mask({in0}, {in1}, _MM_CMPINT_NE);'. \ format(**fmtspec) if typ in ['i32', 'u32', 'i64', 'u64']: return intrinsic else: return intrinsic if simd_ext == 'avx512_skylake' else noteq return noteq # ----------------------------------------------------------------------------- # Greater than def gt2(simd_ext, typ): if typ == 'f16': return f16_cmp2('gt', simd_ext) if simd_ext in sse: if typ in ['f32', 'f64', 'i8', 'i16', 'i32']: return how_it_should_be_op2('cmpgt', simd_ext, typ) if typ == 'i64': if simd_ext == 'sse42': return how_it_should_be_op2('cmpgt', simd_ext, typ) #return '''return _mm_sub_epi64(_mm_setzero_si128(), _mm_srli_epi64( # _mm_sub_epi64({in1}, {in0}), 63));'''. \ # format(**fmtspec) return '''{typ} buf0[2], buf1[2]; _mm_storeu_si128((__m128i*)buf0, {in0}); _mm_storeu_si128((__m128i*)buf1, {in1}); buf0[0] = -(buf0[0] > buf1[0]); buf0[1] = -(buf0[1] > buf1[1]); return _mm_loadu_si128((__m128i*)buf0);'''. \ format(**fmtspec) return cmp2_with_add('gt', simd_ext, typ) if simd_ext in avx: if typ in ['f32', 'f64']: return 'return _mm256_cmp{suf}({in0}, {in1}, _CMP_GT_OQ);'. 
\ format(**fmtspec) if typ in ['i8', 'i16', 'i32', 'i64']: if simd_ext == 'avx2': return how_it_should_be_op2('cmpgt', simd_ext, typ) else: return split_cmp2('gt', simd_ext, typ) if simd_ext == 'avx2': return cmp2_with_add('gt', simd_ext, typ) else: return split_cmp2('gt', simd_ext, typ) # AVX512 if typ in ['f32', 'f64', 'i32', 'i64']: return \ 'return _mm512_cmp{suf}_mask({in0}, {in1}, {cte});'. \ format(cte='_CMP_GT_OQ' if typ in ['f32', 'f64'] else '_MM_CMPINT_NLE', **fmtspec) if typ in ['u32', 'u64']: return \ 'return _mm512_cmp_epu{typ2}_mask({in0}, {in1}, _MM_CMPINT_NLE);'. \ format(typ2=typ[1:], **fmtspec) if simd_ext == 'avx512_skylake': return \ 'return _mm512_cmp_ep{typ}_mask({in0}, {in1}, _MM_CMPINT_NLE);'. \ format(**fmtspec) else: return split_cmp2('gt', simd_ext, typ) # ----------------------------------------------------------------------------- # lesser than def lt2(simd_ext, typ): return 'return nsimd_gt_{simd_ext}_{typ}({in1}, {in0});'. \ format(**fmtspec) # ----------------------------------------------------------------------------- # greater or equal def geq2(simd_ext, typ): if typ == 'f16': return f16_cmp2('ge', simd_ext) notlt = '''return nsimd_notl_{simd_ext}_{typ}( nsimd_lt_{simd_ext}_{typ}({in0}, {in1}));'''. \ format(**fmtspec) if simd_ext in sse: if typ in ['f32', 'f64']: return how_it_should_be_op2('cmpge', simd_ext, typ) if simd_ext in avx: if typ in ['f32', 'f64']: return 'return _mm256_cmp{suf}({in0}, {in1}, _CMP_GE_OQ);'. \ format(**fmtspec) if simd_ext in avx512: if typ in ['i32', 'i64', 'u32', 'u64']: return \ 'return _mm512_cmp_ep{typ}_mask({in0}, {in1}, _MM_CMPINT_NLT);'. \ format(**fmtspec) if typ in ['f32', 'f64']: return 'return _mm512_cmp{suf}_mask({in0}, {in1}, _CMP_GE_OQ);'. \ format(**fmtspec) if simd_ext == 'avx512_skylake': return \ 'return _mm512_cmp_ep{typ}_mask({in0}, {in1}, _MM_CMPINT_NLT);'. 
\ format(**fmtspec) else: return notlt return notlt # ----------------------------------------------------------------------------- # lesser or equal def leq2(simd_ext, typ): if typ == 'f16': return f16_cmp2('le', simd_ext) notgt = '''return nsimd_notl_{simd_ext}_{typ}( nsimd_gt_{simd_ext}_{typ}({in0}, {in1}));'''. \ format(**fmtspec) if simd_ext in sse and typ in ['f32', 'f64']: return 'return _mm_cmple{suf}({in0}, {in1});'.format(**fmtspec) if simd_ext in avx and typ in ['f32', 'f64']: return 'return _mm256_cmp{suf}({in0}, {in1}, _CMP_LE_OQ);'. \ format(**fmtspec) if simd_ext in avx512: if typ in ['i32', 'i64', 'u32', 'u64']: return \ 'return _mm512_cmp_ep{typ}_mask({in0}, {in1}, _MM_CMPINT_LE);'. \ format(**fmtspec) if typ in ['f32', 'f64']: return 'return _mm512_cmp{suf}_mask({in0}, {in1}, _CMP_LE_OQ);'. \ format(**fmtspec) if simd_ext == 'avx512_skylake': return \ 'return _mm512_cmp_ep{typ}_mask({in0}, {in1}, _MM_CMPINT_LE);'. \ format(**fmtspec) else: return notgt return notgt # ----------------------------------------------------------------------------- # if_else1 function def if_else1(simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_if_else1_{simd_ext}_f32( {in0}.v0, {in1}.v0, {in2}.v0); ret.v1 = nsimd_if_else1_{simd_ext}_f32( {in0}.v1, {in1}.v1, {in2}.v1); return ret;'''.format(**fmtspec) manual = '''return nsimd_orb_{simd_ext}_{typ}( nsimd_andb_{simd_ext}_{typ}({in1}, {in0}), nsimd_andnotb_{simd_ext}_{typ}({in2}, {in0}));'''. \ format(**fmtspec) if simd_ext in sse: if simd_ext == 'sse42': return 'return _mm_blendv{fsuf}({in2}, {in1}, {in0});'. \ format(fsuf=suf_ep(typ) if typ in ['f32', 'f64'] else '_epi8', **fmtspec) else: return manual if simd_ext in avx: if typ in ['f32', 'f64']: return 'return _mm256_blendv{suf}({in2}, {in1}, {in0});'. \ format(**fmtspec) else: if simd_ext == 'avx2': return 'return _mm256_blendv_epi8({in2}, {in1}, {in0});'. 
\ format(**fmtspec) else: return manual if simd_ext in avx512: if typ in ['f32', 'f64', 'i32', 'u32', 'i64', 'u64']: return 'return _mm512_mask_blend{suf}({in0}, {in2}, {in1});'. \ format(**fmtspec) else: if simd_ext == 'avx512_skylake': return 'return _mm512_mask_blend{suf}({in0}, {in2}, {in1});'. \ format(**fmtspec) else: return '''int i; {typ} buf0[{le}], buf1[{le}]; _mm512_storeu_si512(buf0, {in1}); _mm512_storeu_si512(buf1, {in2}); for (i = 0; i < {le}; i++) {{ if ((({in0} >> i) & 1) == 0) {{ buf0[i] = buf1[i]; }} }} return _mm512_loadu_si512(buf0);'''.format(**fmtspec) # ----------------------------------------------------------------------------- # min and max functions def minmax(func, simd_ext, typ): if typ in ['f16', 'f32', 'f64']: return how_it_should_be_op2(func, simd_ext, typ) with_if_else = '''return nsimd_if_else1_{simd_ext}_{typ}( nsimd_gt_{simd_ext}_{typ}( {args}), {in0}, {in1});'''. \ format(args = '{in0}, {in1}'.format(**fmtspec) if func == 'max' else '{in1}, {in0}'.format(**fmtspec), **fmtspec) if simd_ext in sse: if typ in ['u8', 'i16']: return 'return _mm_{func}_ep{typ}({in0}, {in1});'. \ format(func=func, **fmtspec) if typ in ['i8', 'u16', 'i32', 'u32']: if simd_ext == 'sse42': return 'return _mm_{func}_ep{typ}({in0}, {in1});'. \ format(func=func, **fmtspec) else: return with_if_else if simd_ext in avx and typ in ['i8', 'u8', 'i16', 'u16', 'i32', 'u32']: if simd_ext == 'avx2': return 'return _mm256_{func}_ep{typ}({in0}, {in1});'. \ format(func=func, **fmtspec) else: return split_op2(func, simd_ext, typ) if simd_ext in avx512: if typ in ['i32', 'u32', 'i64', 'u64']: return 'return _mm512_{func}_ep{typ}({in0}, {in1});'. \ format(func=func, **fmtspec) else: if simd_ext == 'avx512_skylake': return 'return _mm512_{func}_ep{typ}({in0}, {in1});'. 
\ format(func=func, **fmtspec) else: return split_op2(func, simd_ext, typ) return with_if_else # ----------------------------------------------------------------------------- # sqrt def sqrt1(simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = {pre}sqrt_ps({in0}.v0); ret.v1 = {pre}sqrt_ps({in0}.v1); return ret;'''.format(**fmtspec) return 'return {pre}sqrt{suf}({in0});'.format(**fmtspec) # ----------------------------------------------------------------------------- # Load logical def loadl(simd_ext, typ, aligned): if simd_ext in avx512: if typ == 'f16': return '''/* This can surely be improved but it is not our priority. Note that we take advantage of the fact that floating zero is represented as integer zero to simplify code. */ nsimd_{simd_ext}_vlf16 ret; __mmask32 tmp = nsimd_loadlu_{simd_ext}_u16((u16*){in0}); ret.v0 = (__mmask16)(tmp & 0xFFFF); ret.v1 = (__mmask16)((tmp >> 16) & 0xFFFF); return ret;'''.format(**fmtspec) return '''/* This can surely be improved but it is not our priority. */ int i; __mmask{le} ret = 0; for (i = 0; i < {le}; i++) {{ if ({in0}[i] != ({typ})0) {{ ret |= (__mmask{le})((__mmask{le})1 << i); }} }} return ret;'''.format(**fmtspec) return \ '''/* This can surely be improved but it is not our priority. */ return nsimd_notl_{simd_ext}_{typ}(nsimd_eq_{simd_ext}_{typ}( nsimd_load{align}_{simd_ext}_{typ}( {in0}), nsimd_set1_{simd_ext}_{typ}({zero})));'''. \ format(align='a' if aligned else 'u', zero = 'nsimd_f32_to_f16(0.0f)' if typ == 'f16' else '({})0'.format(typ), **fmtspec) # ----------------------------------------------------------------------------- # Store logical def storel(simd_ext, typ, aligned): if simd_ext in avx512: if typ == 'f16': return '''/* This can surely be improved but it is not our priority. Note that we take advantage of the fact that floating zero is represented as integer zero to simplify code. 
*/ int i; u16 one = 0x3C00; /* FP16 IEEE754 representation of 1 */ for (i = 0; i < 16; i++) {{ ((u16*){in0})[i] = (u16)((({in1}.v0 >> i) & 1) ? one : 0); }} for (i = 0; i < 16; i++) {{ ((u16*){in0})[i + 16] = (u16)((({in1}.v1 >> i) & 1) ? one : 0); }}'''.format(**fmtspec) return '''/* This can surely be improved but it is not our priority. */ int i; for (i = 0; i < {le}; i++) {{ {in0}[i] = ({typ})((({in1} >> i) & 1) ? 1 : 0); }}'''.format(**fmtspec) return \ '''/* This can surely be improved but it is not our priority. */ nsimd_store{align}_{simd_ext}_{typ}({in0}, nsimd_if_else1_{simd_ext}_{typ}({in1}, nsimd_set1_{simd_ext}_{typ}({one}), nsimd_set1_{simd_ext}_{typ}({zero})));'''. \ format(align = 'a' if aligned else 'u', one = 'nsimd_f32_to_f16(1.0f)' if typ == 'f16' else '({})1'.format(typ), zero = 'nsimd_f32_to_f16(0.0f)' if typ == 'f16' else '({})0'.format(typ), **fmtspec) # ----------------------------------------------------------------------------- # Absolute value def abs1(simd_ext, typ): def mask(typ): return '0x7F' + ('F' * int(((int(typ[1:]) - 8) // 4))) if typ == 'f16': return \ '''nsimd_{simd_ext}_vf16 ret; nsimd_{simd_ext}_vf32 mask = {pre}castsi{nbits}_ps( nsimd_set1_{simd_ext}_u32({mask})); ret.v0 = nsimd_andb_{simd_ext}_f32({in0}.v0, mask); ret.v1 = nsimd_andb_{simd_ext}_f32({in0}.v1, mask); return ret;'''.format(mask=mask('f32'), **fmtspec) if typ in ['u8', 'u16', 'u32', 'u64']: return 'return {in0};'.format(**fmtspec) if typ in ['f32', 'f64']: return \ '''nsimd_{simd_ext}_v{typ} mask = {pre}castsi{nbits}{suf}( nsimd_set1_{simd_ext}_u{typnbits}({mask})); return nsimd_andb_{simd_ext}_{typ}({in0}, mask);'''. \ format(mask=mask(typ), **fmtspec) bit_twiddling_arith_shift = \ '''nsimd_{simd_ext}_v{typ} mask = {pre}srai{suf}({in0}, {typnbitsm1}); return {pre}xor{sufsi}({pre}add{suf}({in0}, mask), mask);'''. 
\ format(typnbitsm1=int(typ[1:]) - 1, **fmtspec) bit_twiddling_no_arith_shift = \ '''nsimd_{simd_ext}_v{typ} mask = {pre}sub{suf}({pre}setzero{sufsi}(), nsimd_shr_{simd_ext}_{typ}( {in0}, {typnbitsm1})); return {pre}xor{sufsi}({pre}add{suf}({in0}, mask), mask);'''. \ format(typnbitsm1=int(typ[1:]) - 1, **fmtspec) with_blendv = \ '''return _mm256_castpd_si256(_mm256_blendv_pd( _mm256_castsi256_pd({in0}), _mm256_castsi256_pd(_mm256_sub_epi64(_mm256_setzero_si256(), {in0})), _mm256_castsi256_pd({in0})));'''.format(**fmtspec) if simd_ext in sse: if typ in ['i16', 'i32']: if simd_ext == 'sse42': return 'return _mm_abs{suf}({in0});'.format(**fmtspec) else: return bit_twiddling_arith_shift if typ == 'i8': if simd_ext == 'sse42': return 'return _mm_abs{suf}({in0});'.format(**fmtspec) else: return bit_twiddling_no_arith_shift if typ == 'i64': return bit_twiddling_no_arith_shift if simd_ext in avx: if typ in ['i8', 'i16', 'i32']: if simd_ext == 'avx2': return 'return _mm256_abs{suf}({in0});'.format(**fmtspec) else: return split_opn('abs', simd_ext, typ, 1) else: if simd_ext == 'avx2': return with_blendv else: return split_opn('abs', simd_ext, typ, 1) if simd_ext in avx512: if typ in ['i32', 'i64']: return 'return _mm512_abs{suf}({in0});'.format(**fmtspec) else: if simd_ext == 'avx512_skylake': return 'return _mm512_abs{suf}({in0});'.format(**fmtspec) else: return split_opn('abs', simd_ext, typ, 1) # ----------------------------------------------------------------------------- # FMA and FMS def fma_fms(func, simd_ext, typ): op = 'add' if func in ['fma', 'fnma'] else 'sub' neg = 'n' if func in ['fnma', 'fnms'] else '' if typ == 'f16': return \ '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_{func}_{simd_ext}_f32({in0}.v0, {in1}.v0, {in2}.v0); ret.v1 = nsimd_{func}_{simd_ext}_f32({in0}.v1, {in1}.v1, {in2}.v1); return ret;'''.format(neg=neg, func=func, **fmtspec) if neg == '': emulate = '''return nsimd_{op}_{simd_ext}_{typ}( nsimd_mul_{simd_ext}_{typ}({in0}, {in1}), 
{in2});'''.format(op=op, **fmtspec) else: emulate = '''return nsimd_{op}_{simd_ext}_{typ}( nsimd_mul_{simd_ext}_{typ}( nsimd_neg_{simd_ext}_{typ}({in0}), {in1}), {in2});'''.format(op=op, **fmtspec) # One could use only emulate and no split. But to avoid splitting and # merging SIMD register for each operation: sub, mul and add, we use # emulation only for SIMD extensions that have natively add, sub and mul # intrinsics. split = split_opn(func, simd_ext, typ, 3) if typ in ['f32', 'f64']: if simd_ext in sse + avx: return '''#ifdef NSIMD_FMA return {pre}f{neg}m{op}{suf}({in0}, {in1}, {in2}); # else {emulate} # endif'''.format(op=op, neg=neg, emulate=emulate, **fmtspec) else: return 'return {pre}f{neg}m{op}{suf}({in0}, {in1}, {in2});'. \ format(op=op, neg=neg, **fmtspec) if simd_ext in avx: return emulate if simd_ext == 'avx2' else split if simd_ext in avx512: return emulate if simd_ext == 'avx512_skylake' else split return emulate # ----------------------------------------------------------------------------- # Ceil and floor def round1(opts, func, simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_{func}_{simd_ext}_f32({in0}.v0); ret.v1 = nsimd_{func}_{simd_ext}_f32({in0}.v1); return ret;'''.format(func=func, **fmtspec) if typ in ['f32', 'f64']: normal = 'return {pre}{func}{suf}({in0});'.format(func=func, **fmtspec) if simd_ext not in sse: return normal if simd_ext == 'sse42': return normal else: return emulate_op1(opts, func, simd_ext, typ) return 'return {in0};'.format(**fmtspec) # ----------------------------------------------------------------------------- # Trunc def trunc1(opts, simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_trunc_{simd_ext}_f32({in0}.v0); ret.v1 = nsimd_trunc_{simd_ext}_f32({in0}.v1); return ret;'''.format(**fmtspec) if typ in ['f32', 'f64']: normal = '''return {pre}round{suf}({in0}, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);'''.format(**fmtspec) if simd_ext == 'sse2': 
return emulate_op1(opts, 'trunc', simd_ext, typ) if simd_ext == 'sse42': return normal if simd_ext in avx: return normal if simd_ext in avx512: return \ '''__mmask{le} cond = nsimd_gt_{simd_ext}_{typ}( {in0}, _mm512_setzero{sufsi}()); return nsimd_if_else1_{simd_ext}_{typ}(cond, nsimd_floor_{simd_ext}_{typ}({in0}), nsimd_ceil_{simd_ext}_{typ}({in0}));'''. \ format(**fmtspec) return 'return {in0};'.format(**fmtspec) # ----------------------------------------------------------------------------- # Round to even def round_to_even1(opts, simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_round_to_even_{simd_ext}_f32({in0}.v0); ret.v1 = nsimd_round_to_even_{simd_ext}_f32({in0}.v1); return ret;'''.format(**fmtspec) if typ in ['f32', 'f64']: normal = '''return {pre}round{suf}({in0}, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);'''.format(**fmtspec) if simd_ext == 'sse2': return emulate_op1(opts, 'round_to_even', simd_ext, typ) if simd_ext == 'sse42': return normal if simd_ext in avx: return normal if simd_ext in avx512: return 'return _mm512_roundscale{suf}({in0}, 0);'.format(**fmtspec) return 'return {in0};'.format(**fmtspec) # ----------------------------------------------------------------------------- # All and any functions def all_any(func, simd_ext, typ): if typ == 'f16': return \ '''return nsimd_{func}_{simd_ext}_f32({in0}.v0) {and_or} nsimd_{func}_{simd_ext}_f32({in0}.v1);'''. \ format(func=func, and_or='&&' if func == 'all' else '||', **fmtspec) if simd_ext in sse: if typ in common.iutypes: return 'return (u32)_mm_movemask_epi8({in0}) {test};'. \ format(test='== 0xFFFF' if func == 'all' else '!= 0u', **fmtspec) else: mask = '0xF' if typ == 'f32' else '0x3' return 'return (u32)_mm_movemask{suf}({in0}) {test};'. \ format(test='== ' + mask if func == 'all' else '!= 0u', **fmtspec) if simd_ext in avx: if typ in common.iutypes: if simd_ext == 'avx2': return 'return _mm256_movemask_epi8({in0}) {test};'. 
\ format(test='== -1' if func == 'all' else '!= 0', **fmtspec) else: return \ '''nsimd_sse42_v{typ} lo = {extract_lo}; nsimd_sse42_v{typ} hi = {extract_hi}; return nsimd_{func}_sse42_{typ}(lo) {and_or} nsimd_{func}_sse42_{typ}(hi);'''. \ format(extract_lo=extract(simd_ext, typ, LO, common.in0), extract_hi=extract(simd_ext, typ, HI, common.in0), func=func, and_or='&&' if func == 'all' else '||', **fmtspec) else: mask = '0xFF' if typ == 'f32' else '0xF' return 'return _mm256_movemask{suf}({in0}) {test};'. \ format(test='== ' + mask if func == 'all' else '!= 0', **fmtspec) if simd_ext in avx512: all_test = '== 0x' + ('F' * int((512 // int(typ[1:]) // 4))) return 'return {in0} {test};'. \ format(test=all_test if func == 'all' else '!= 0', **fmtspec) # ----------------------------------------------------------------------------- # Reinterpret (bitwise_cast) def reinterpret1(simd_ext, from_typ, to_typ): if from_typ == to_typ: return 'return {in0};'.format(**fmtspec) if to_typ == 'f16': emulate = '''{from_typ} buf[{le}]; nsimd_storeu_{simd_ext}_{from_typ}(buf, {in0}); return nsimd_loadu_{simd_ext}_f16((f16*)buf);'''. \ format(**fmtspec) native = '''nsimd_{simd_ext}_vf16 ret; ret.v0 = {pre}cvtph_ps({extract_lo}); ret.v1 = {pre}cvtph_ps({extract_hi}); return ret;'''.format( extract_lo=extract(simd_ext, 'u16', LO, common.in0), extract_hi=extract(simd_ext, 'u16', HI, common.in0), **fmtspec) if simd_ext in sse: return \ '''#ifdef NSIMD_FP16 nsimd_{simd_ext}_vf16 ret; ret.v0 = _mm_cvtph_ps({in0}); {in0} = _mm_shuffle_epi32({in0}, 14); /* = (3 << 2) | (2 << 0) */ ret.v1 = _mm_cvtph_ps({in0}); return ret; #else {emulate} #endif'''.format(emulate=emulate, **fmtspec) if simd_ext in avx: return \ '''#ifdef NSIMD_FP16 {} #else {} #endif'''.format(native, emulate) if simd_ext in avx512: return native if from_typ == 'f16': emulate = \ '''u16 buf[{le}]; nsimd_storeu_{simd_ext}_f16((f16*)buf, {in0}); return nsimd_loadu_{simd_ext}_{to_typ}(({to_typ}*)buf);'''. 
\ format(**fmtspec) native = 'return {};'.format(setr(simd_ext, 'u16', '{pre}cvtps_ph({in0}.v0, 4)'.format(**fmtspec), '{pre}cvtps_ph({in0}.v1, 4)'.format(**fmtspec))) if simd_ext in sse: return \ '''#ifdef NSIMD_FP16 __m128i lo = _mm_cvtps_ph({in0}.v0, 4); __m128i hi = _mm_cvtps_ph({in0}.v1, 4); return _mm_castpd_si128(_mm_shuffle_pd( _mm_castsi128_pd(lo), _mm_castsi128_pd(hi), 0)); #else {emulate} #endif'''.format(emulate=emulate, **fmtspec) if simd_ext in avx: return \ '''#ifdef NSIMD_FP16 {} #else {} #endif'''.format(native, emulate) if simd_ext in avx512: return native if from_typ in common.iutypes and to_typ in common.iutypes: return 'return {in0};'.format(**fmtspec) if to_typ in ['f32', 'f64']: return 'return {pre}castsi{nbits}{to_suf}({in0});'. \ format(to_suf=suf_ep(to_typ), **fmtspec) if from_typ in ['f32', 'f64']: return 'return {pre}cast{from_suf}_si{nbits}({in0});'. \ format(from_suf=suf_ep(from_typ)[1:], **fmtspec) # ----------------------------------------------------------------------------- # Reinterpretl, i.e. reinterpret on logicals def reinterpretl1(simd_ext, from_typ, to_typ): if from_typ == to_typ: return 'return {in0};'.format(**fmtspec) if to_typ == 'f16': if simd_ext in sse: return \ '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = _mm_castsi128_ps(_mm_unpacklo_epi16({in0}, {in0})); ret.v1 = _mm_castsi128_ps(_mm_unpackhi_epi16({in0}, {in0})); return ret;'''.format(**fmtspec) if simd_ext == 'avx': return \ '''nsimd_{simd_ext}_vlf16 ret; nsimd_sse42_vlf16 tmp1 = nsimd_reinterpretl_sse42_f16_{from_typ}( _mm256_castsi256_si128({in0})); nsimd_sse42_vlf16 tmp2 = nsimd_reinterpretl_sse42_f16_{from_typ}( _mm256_extractf128_si256({in0}, 1)); ret.v0 = {setr_tmp1}; ret.v1 = {setr_tmp2}; return ret;'''. 
\ format(setr_tmp1=setr('avx', 'f32', 'tmp1.v0', 'tmp1.v1'), setr_tmp2=setr('avx', 'f32', 'tmp2.v0', 'tmp2.v1'), **fmtspec) if simd_ext == 'avx2': return \ '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = _mm256_castsi256_ps(_mm256_cvtepi16_epi32( _mm256_castsi256_si128({in0}))); ret.v1 = _mm256_castsi256_ps(_mm256_cvtepi16_epi32( _mm256_extractf128_si256({in0}, 1))); return ret;'''.format(**fmtspec) if simd_ext in avx512: return '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = (__mmask16)({in0} & 0xFFFF); ret.v1 = (__mmask16)(({in0} >> 16) & 0xFFFF); return ret;'''.format(**fmtspec) if from_typ == 'f16': if simd_ext in sse + avx: return '''f32 in[{le}]; {to_typ} out[{le}]; int i; nsimd_storeu_{simd_ext}_f32(in, {in0}.v0); nsimd_storeu_{simd_ext}_f32(in + {leo2}, {in0}.v1); for (i = 0; i < {le}; i++) {{ out[i] = ({to_typ})(in[i] != 0.0f ? -1 : 0); }} return nsimd_loadu_{simd_ext}_{to_typ}(out);'''. \ format(leo2=int(fmtspec['le']) // 2, **fmtspec) if simd_ext in avx512: return \ 'return (__mmask32){in0}.v0 | ((__mmask32){in0}.v1 << 16);'. 
\ format(**fmtspec) if simd_ext in sse + avx: return reinterpret1(simd_ext, from_typ, to_typ) else: return 'return {in0};'.format(**fmtspec) # ----------------------------------------------------------------------------- # Convert def convert1(simd_ext, from_typ, to_typ): if to_typ == from_typ or \ to_typ in common.iutypes and from_typ in common.iutypes: return 'return {in0};'.format(**fmtspec) if to_typ == 'f16': if simd_ext in sse: getlo = '{in0}'.format(**fmtspec) gethi = '_mm_unpackhi_epi64({in0}, {in0})'.format(**fmtspec) if simd_ext in avx: getlo = '_mm256_castsi256_si128({in0})'.format(**fmtspec) gethi = '_mm256_extractf128_si256({in0}, 1)'.format(**fmtspec) if simd_ext in avx512: getlo = '_mm512_castsi512_si256({in0})'.format(**fmtspec) gethi = '_mm512_extracti64x4_epi64({in0}, 1)'.format(**fmtspec) through_epi32 = \ '''nsimd_{simd_ext}_v{to_typ} ret; ret.v0 = {pre}cvtepi32_ps({pre}cvtep{from_typ}_epi32({getlo})); ret.v1 = {pre}cvtepi32_ps({pre}cvtep{from_typ}_epi32({gethi})); return ret;'''.format(getlo=getlo, gethi=gethi, **fmtspec) emulate = '''{from_typ} in[{le}]; f32 out[{leo2}]; nsimd_{simd_ext}_vf16 ret; int i; nsimd_storeu_{simd_ext}_{from_typ}(in, {in0}); for (i = 0; i < {leo2}; i++) {{ out[i] = (f32)in[i]; }} ret.v0 = nsimd_loadu_{simd_ext}_f32(out); for (i = 0; i < {leo2}; i++) {{ out[i] = (f32)in[i + {leo2}]; }} ret.v1 = nsimd_loadu_{simd_ext}_f32(out); return ret;'''.format(leo2=int(fmtspec['le']) // 2, **fmtspec) if simd_ext in ['sse42', 'avx2']: return through_epi32 if simd_ext in ['sse2', 'avx']: return emulate if simd_ext in avx512: return through_epi32 if from_typ == 'f16': return '''f32 in[{leo2}]; {to_typ} out[{le}]; int i; nsimd_storeu_{simd_ext}_f32(in, {in0}.v0); for (i = 0; i < {leo2}; i++) {{ out[i] = ({to_typ})in[i]; }} nsimd_storeu_{simd_ext}_f32(in, {in0}.v1); for (i = 0; i < {leo2}; i++) {{ out[i + {leo2}] = ({to_typ})in[i]; }} return nsimd_loadu_{simd_ext}_{to_typ}(out);'''. 
\ format(leo2=int(fmtspec['le']) // 2, **fmtspec) emulate = '''{from_typ} in[{le}]; {to_typ} out[{le}]; int i; nsimd_storeu_{simd_ext}_{from_typ}(in, {in0}); for (i = 0; i < {le}; i++) {{ out[i] = ({to_typ})in[i]; }} return nsimd_loadu_{simd_ext}_{to_typ}(out);'''. \ format(**fmtspec) if to_typ == 'f64' or from_typ == 'f64': if simd_ext == 'avx512_skylake': return 'return _mm512_cvt{from_suf}{to_suf}({in0});'. \ format(from_suf=suf_ep(from_typ)[1:], to_suf=suf_ep(to_typ), **fmtspec) else: return emulate if to_typ == 'f32' and from_typ == 'i32': return 'return {pre}cvtepi32_ps({in0});'.format(**fmtspec) if to_typ == 'f32' and from_typ == 'u32': if simd_ext in sse + avx: return emulate if simd_ext in avx512: return 'return _mm512_cvtepu32_ps({in0});'.format(**fmtspec) if to_typ == 'i32' and from_typ == 'f32': return 'return {pre}cvtps_epi32({in0});'.format(**fmtspec) if to_typ == 'u32' and from_typ == 'f32': if simd_ext in sse + avx: return emulate if simd_ext in avx512: return 'return _mm512_cvtps_epu32({in0});'.format(**fmtspec) # ----------------------------------------------------------------------------- # Reciprocal (at least 11 bits of precision) def rec11_rsqrt11(func, simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_{func}11_{simd_ext}_f32({in0}.v0); ret.v1 = nsimd_{func}11_{simd_ext}_f32({in0}.v1); return ret;'''. \ format(func='rec' if func == 'rcp' else 'rsqrt', **fmtspec) if typ == 'f32': if simd_ext in sse + avx: return 'return {pre}{func}_ps({in0});'.format(func=func, **fmtspec) if simd_ext in avx512: return 'return _mm512_{func}14_ps({in0});'. \ format(func=func, **fmtspec) if typ == 'f64': if simd_ext in sse + avx: one = '{pre}set1_pd(1.0)'.format(**fmtspec) if func == 'rcp': return 'return {pre}div{suf}({one}, {in0});'.format(one=one, **fmtspec) else: return 'return {pre}div{suf}({one}, {pre}sqrt{suf}({in0}));'. 
\ format(one=one, **fmtspec) format(func=func, **fmtspec) if simd_ext in avx512: return 'return _mm512_{func}14_pd({in0});'. \ format(func=func, **fmtspec) # ----------------------------------------------------------------------------- # Reciprocal (IEEE) def rec1(simd_ext, typ): one = '{pre}set1_ps(1.0f)'.format(**fmtspec) if typ in ['f16', 'f32'] \ else '{pre}set1_pd(1.0)'.format(**fmtspec) if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; nsimd_{simd_ext}_vf32 one = {one}; ret.v0 = {pre}div_ps(one, {in0}.v0); ret.v1 = {pre}div_ps(one, {in0}.v1); return ret;'''.format(one=one, **fmtspec) return 'return {pre}div{suf}({one}, {in0});'.format(one=one, **fmtspec) # ----------------------------------------------------------------------------- # Negative def neg1(simd_ext, typ): cte = '0x80000000' if typ in ['f16', 'f32'] else '0x8000000000000000' fsuf = '_ps' if typ in ['f16', 'f32'] else '_pd' utyp = 'u32' if typ in ['f16', 'f32'] else 'u64' vmask = '{pre}castsi{nbits}{fsuf}(nsimd_set1_{simd_ext}_{utyp}({cte}))'. \ format(cte=cte, utyp=utyp, fsuf=fsuf, **fmtspec) if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; nsimd_{simd_ext}_vf32 mask = {vmask}; ret.v0 = nsimd_xorb_{simd_ext}_f32(mask, {in0}.v0); ret.v1 = nsimd_xorb_{simd_ext}_f32(mask, {in0}.v1); return ret;'''.format(vmask=vmask, **fmtspec) if typ in ['f32', 'f64']: return 'return nsimd_xorb_{simd_ext}_{typ}({vmask}, {in0});'. \ format(vmask=vmask, **fmtspec) return '''return nsimd_sub_{simd_ext}_{typ}( {pre}setzero_si{nbits}(), {in0});'''. \ format(**fmtspec) # ----------------------------------------------------------------------------- # nbtrue def nbtrue1(simd_ext, typ): if typ == 'f16': return '''return nsimd_nbtrue_{simd_ext}_f32({in0}.v0) + nsimd_nbtrue_{simd_ext}_f32({in0}.v1);'''. \ format(**fmtspec) if typ in ['i8', 'u8']: code = 'return nsimd_popcnt32_((u32){pre}movemask_epi8({in0}));'. 
\ format(**fmtspec) elif typ in ['i16', 'u16']: code = 'return nsimd_popcnt32_((u32){pre}movemask_epi8({in0})) >> 1;'. \ format(**fmtspec) elif typ in ['i32', 'u32', 'i64', 'u64']: code = '''return nsimd_popcnt32_((u32){pre}movemask{fsuf}( {pre}castsi{nbits}{fsuf}({in0})));'''. \ format(fsuf='_ps' if typ in ['i32', 'u32'] else '_pd', **fmtspec) else: code = 'return nsimd_popcnt32_((u32){pre}movemask{suf}({in0}));'. \ format(**fmtspec) if simd_ext in sse: return code if simd_ext in avx: if typ in ['i32', 'u32', 'i64', 'u64', 'f32', 'f64']: return code else: if simd_ext == 'avx2': return code else: return \ '''return nsimd_nbtrue_sse42_{typ}( _mm256_castsi256_si128({in0})) + nsimd_nbtrue_sse42_{typ}( _mm256_extractf128_si256({in0}, 1));'''. \ format(**fmtspec) if simd_ext in avx512: return 'return nsimd_popcnt64_((u64){in0});'.format(**fmtspec) # ----------------------------------------------------------------------------- # reverse def reverse1(simd_ext, typ): # 8-bit int if typ in ['i8', 'u8']: if simd_ext == 'sse2': return '''{in0} = _mm_shufflehi_epi16({in0}, _MM_SHUFFLE(0,1,2,3)); {in0} = _mm_shufflelo_epi16({in0}, _MM_SHUFFLE(0,1,2,3)); {in0} = _mm_castpd_si128(_mm_shuffle_pd( _mm_castsi128_pd({in0}), _mm_castsi128_pd( {in0}), 1)); nsimd_{simd_ext}_v{typ} r0 = _mm_srli_epi16({in0}, 8); nsimd_{simd_ext}_v{typ} r1 = _mm_slli_epi16({in0}, 8); return _mm_or_si128(r0, r1);'''.format(**fmtspec) elif simd_ext == 'sse42': return '''nsimd_{simd_ext}_v{typ} mask = _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); return _mm_shuffle_epi8({in0}, mask);'''. 
\ format(**fmtspec) elif simd_ext == 'avx': return \ '''nsimd_sse42_v{typ} r0 = _mm_shuffle_epi8( _mm256_extractf128_si256({in0}, 0), _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); nsimd_sse42_v{typ} r1 = _mm_shuffle_epi8( _mm256_extractf128_si256({in0}, 1), _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); {in0} = _mm256_insertf128_si256({in0}, r0, 1); return _mm256_insertf128_si256({in0}, r1, 0);'''. \ format(**fmtspec) elif simd_ext == 'avx2': return \ '''{in0} = _mm256_shuffle_epi8({in0}, _mm256_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31)); return _mm256_permute2x128_si256({in0}, {in0}, 1);'''. \ format(**fmtspec) # AVX-512F and above. else: return \ '''nsimd_avx2_v{typ} r0 = _mm512_extracti64x4_epi64({in0}, 0); nsimd_avx2_v{typ} r1 = _mm512_extracti64x4_epi64({in0}, 1); r0 = _mm256_shuffle_epi8(r0, _mm256_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31)); r1 = _mm256_shuffle_epi8(r1, _mm256_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31)); r0 = _mm256_permute2x128_si256(r0, r0, 1); r1 = _mm256_permute2x128_si256(r1, r1, 1); {in0} = _mm512_insertf64x4({in0}, r0, 1); return _mm512_insertf64x4({in0}, r1, 0);'''.format(**fmtspec) # 16-bit int elif typ in ['i16', 'u16']: if simd_ext == 'sse2': return '''{in0} = _mm_shufflehi_epi16( {in0}, _MM_SHUFFLE(0,1,2,3) ); {in0} = _mm_shufflelo_epi16( {in0}, _MM_SHUFFLE(0,1,2,3) ); return _mm_castpd_si128(_mm_shuffle_pd( _mm_castsi128_pd({in0}), _mm_castsi128_pd({in0}), 1));'''. 
\ format(**fmtspec) elif simd_ext == 'sse42': return \ '''return _mm_shuffle_epi8({in0}, _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));'''.format(**fmtspec) elif simd_ext == 'avx': return \ '''nsimd_sse42_v{typ} r0 = _mm_shuffle_epi8( _mm256_extractf128_si256({in0}, 0), _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14)); nsimd_sse42_v{typ} r1 = _mm_shuffle_epi8( _mm256_extractf128_si256({in0}, 1), _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14)); {in0} = _mm256_insertf128_si256( {in0}, r0, 1); return _mm256_insertf128_si256({in0}, r1, 0);'''. \ format(**fmtspec) elif simd_ext == 'avx2': return \ '''{in0} = _mm256_shuffle_epi8({in0}, _mm256_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30)); return _mm256_permute2x128_si256({in0}, {in0}, 1);'''. \ format(**fmtspec) # AVX-512F elif simd_ext == 'avx512_knl': return \ '''{in0} = _mm512_permutexvar_epi32(_mm512_set_epi32( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), {in0}); nsimd_{simd_ext}_v{typ} r0 = _mm512_srli_epi32({in0}, 16); nsimd_{simd_ext}_v{typ} r1 = _mm512_slli_epi32({in0}, 16); return _mm512_or_si512(r0, r1);'''.format(**fmtspec) # AVX-512F+BW (Skylake) + WORKAROUND GCC<=8 else: return \ '''return _mm512_permutexvar_epi16(_mm512_set_epi32( (0<<16) | 1, (2<<16) | 3, (4<<16) | 5, (6<<16) | 7, (8<<16) | 9, (10<<16) | 11, (12<<16) | 13, (14<<16) | 15, (16<<16) | 17, (18<<16) | 19, (20<<16) | 21, (22<<16) | 23, (24<<16) | 25, (26<<16) | 27, (28<<16) | 29, (30<<16) | 31), {in0} );'''.format(**fmtspec) # 32-bit int elif typ in ['i32', 'u32']: if simd_ext in ['sse2', 'sse42']: return 'return _mm_shuffle_epi32({in0}, _MM_SHUFFLE(0,1,2,3));'. 
\ format(**fmtspec) elif simd_ext == 'avx': return '''{in0} = _mm256_castps_si256(_mm256_shuffle_ps( _mm256_castsi256_ps({in0}), _mm256_castsi256_ps({in0}), _MM_SHUFFLE(0,1,2,3))); return _mm256_permute2f128_si256({in0}, {in0}, 1);'''. \ format(**fmtspec) elif simd_ext == 'avx2': return \ '''{in0} = _mm256_shuffle_epi32({in0}, _MM_SHUFFLE(0,1,2,3)); return _mm256_permute2x128_si256({in0}, {in0}, 1);'''. \ format(**fmtspec) else: return \ '''return _mm512_permutexvar_epi32(_mm512_set_epi32( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), {in0});'''. \ format(**fmtspec) elif typ in ['i64', 'u64']: if simd_ext in ['sse2', 'sse42']: return '''return _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd( {in0}), _mm_castsi128_pd({in0}), 1));'''. \ format(**fmtspec) elif simd_ext == 'avx': return '''{in0} = _mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd({in0}), _mm256_castsi256_pd({in0}), (1<<2) | 1 ) ); return _mm256_permute2f128_si256({in0}, {in0}, 1);'''. \ format(**fmtspec) elif simd_ext == 'avx2': return '''return _mm256_permute4x64_epi64({in0}, _MM_SHUFFLE(0, 1, 2, 3));'''.format(**fmtspec) else: return '''return _mm512_permutexvar_epi64(_mm512_set_epi64( 0, 1, 2, 3, 4, 5, 6, 7), {in0});'''. \ format(**fmtspec) # 16-bit float elif typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_reverse_{simd_ext}_f32({in0}.v0); ret.v1 = nsimd_reverse_{simd_ext}_f32({in0}.v1); return ret;'''.format(**fmtspec) # 32-bit float elif typ == 'f32': if simd_ext in ['sse2', 'sse42']: return '''return _mm_shuffle_ps({in0}, {in0}, _MM_SHUFFLE(0, 1, 2, 3));'''.format(**fmtspec) elif simd_ext in ['avx', 'avx2']: return '''{in0} = _mm256_shuffle_ps({in0}, {in0}, _MM_SHUFFLE(0, 1, 2, 3)); return _mm256_permute2f128_ps({in0}, {in0}, 1);'''. 
\ format(**fmtspec) else: return \ '''return _mm512_permutexvar_ps(_mm512_set_epi32( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), {in0} );'''.format(**fmtspec) # 64-bit float else: if simd_ext in ['sse2', 'sse42']: return 'return _mm_shuffle_pd({in0}, {in0}, 1);'.format(**fmtspec) elif simd_ext == 'avx': return '''{in0} = _mm256_shuffle_pd({in0}, {in0}, (1<<2) | 1); return _mm256_permute2f128_pd({in0}, {in0}, 1);'''. \ format(**fmtspec) elif simd_ext == 'avx2': return '''return _mm256_permute4x64_pd({in0}, _MM_SHUFFLE(0, 1, 2, 3));'''.format(**fmtspec) else: return '''return _mm512_permute_mm512_set_epi64( 0, 1, 2, 3, 4, 5, 6, 7), {in0});'''. \ format(**fmtspec) # ----------------------------------------------------------------------------- # addv def addv(simd_ext, typ): if simd_ext in sse: if typ == 'f64': return \ '''return _mm_cvtsd_f64(_mm_add_pd({in0}, _mm_shuffle_pd({in0}, {in0}, 0x01)));'''. \ format(**fmtspec) elif typ == 'f32': return \ '''nsimd_{simd_ext}_vf32 tmp = _mm_add_ps({in0}, _mm_shuffle_ps( {in0}, {in0}, 0xb1)); return _mm_cvtss_f32(_mm_add_ps(tmp, _mm_shuffle_ps( tmp, tmp, 0x4e)));''' .format(**fmtspec) elif typ == 'f16': return \ '''nsimd_{simd_ext}_vf32 tmp0 = _mm_add_ps({in0}.v0, _mm_shuffle_ps({in0}.v0, {in0}.v0, 0xb1)); nsimd_{simd_ext}_vf32 tmp1 = _mm_add_ps({in0}.v1, _mm_shuffle_ps({in0}.v1, {in0}.v1, 0xb1)); return nsimd_f32_to_f16(_mm_cvtss_f32(_mm_add_ps( tmp0, _mm_shuffle_ps(tmp0, tmp0, 0x4e))) + _mm_cvtss_f32(_mm_add_ps(tmp1, _mm_shuffle_ps( tmp1, tmp1, 0x4e))));''' .format(**fmtspec) elif simd_ext in avx: if typ == 'f64': return \ '''__m128d tmp = _mm_add_pd(_mm256_extractf128_pd({in0}, 1), _mm256_extractf128_pd({in0}, 0)); return _mm_cvtsd_f64(_mm_add_pd(tmp, _mm_shuffle_pd( tmp, tmp, 0x01)));''' .format(**fmtspec) elif typ == 'f32': return \ '''__m128 tmp0 = _mm_add_ps(_mm256_extractf128_ps({in0}, 1), _mm256_extractf128_ps({in0}, 0)); __m128 tmp1 = _mm_add_ps(tmp0, _mm_shuffle_ps(tmp0, tmp0, 0xb1)); return 
_mm_cvtss_f32(_mm_add_ps(tmp1, _mm_shuffle_ps( tmp1, tmp1, 0x4e)));''' .format(**fmtspec) elif typ == 'f16': return \ '''__m128 tmp00 = _mm_add_ps(_mm256_extractf128_ps({in0}.v0, 1), _mm256_extractf128_ps({in0}.v0, 0)); __m128 tmp01 = _mm_add_ps(tmp00, _mm_shuffle_ps( tmp00, tmp00, 0xb1)); __m128 tmp10 = _mm_add_ps(_mm256_extractf128_ps({in0}.v1, 1), _mm256_extractf128_ps({in0}.v1, 0)); __m128 tmp11 = _mm_add_ps(tmp10, _mm_shuffle_ps( tmp10, tmp10, 0xb1)); return nsimd_f32_to_f16(_mm_cvtss_f32(_mm_add_ps( tmp01, _mm_shuffle_ps(tmp01, tmp01, 0x4e))) + _mm_cvtss_f32(_mm_add_ps(tmp11, _mm_shuffle_ps( tmp11, tmp11, 0x4e)))); ''' .format(**fmtspec) elif simd_ext in avx512: if typ == 'f64': return \ '''__m256d tmp0 = _mm256_add_pd(_mm512_extractf64x4_pd({in0}, 0), _mm512_extractf64x4_pd({in0}, 1)); __m128d tmp1 = _mm_add_pd(_mm256_extractf128_pd(tmp0, 1), _mm256_extractf128_pd(tmp0, 0)); return _mm_cvtsd_f64(_mm_add_pd(tmp1, _mm_shuffle_pd( tmp1, tmp1, 0x01)));''' .format(**fmtspec) elif typ == 'f32': return \ '''__m128 tmp0 = _mm_add_ps(_mm_add_ps(_mm512_extractf32x4_ps( {in0}, 0), _mm512_extractf32x4_ps({in0}, 1)), _mm_add_ps(_mm512_extractf32x4_ps({in0}, 2), _mm512_extractf32x4_ps({in0}, 3))); __m128 tmp1 = _mm_add_ps(tmp0, _mm_shuffle_ps( tmp0, tmp0, 0xb1)); return _mm_cvtss_f32(_mm_add_ps(tmp1, _mm_shuffle_ps( tmp1, tmp1, 0x4e)));''' .format(**fmtspec) elif typ == 'f16': return \ '''f32 res; __m128 tmp0 = _mm_add_ps( _mm_add_ps(_mm512_extractf32x4_ps({in0}.v0, 0), _mm512_extractf32x4_ps({in0}.v0, 1)), _mm_add_ps(_mm512_extractf32x4_ps({in0}.v0, 2), _mm512_extractf32x4_ps({in0}.v0, 3))); __m128 tmp1 = _mm_add_ps(tmp0, _mm_shuffle_ps( tmp0, tmp0, 0xb1)); res = _mm_cvtss_f32(_mm_add_ps(tmp1, _mm_shuffle_ps( tmp1, tmp1, 0x4e))); tmp0 = _mm_add_ps( _mm_add_ps(_mm512_extractf32x4_ps({in0}.v1, 0), _mm512_extractf32x4_ps({in0}.v1, 1)), _mm_add_ps(_mm512_extractf32x4_ps({in0}.v1, 2), _mm512_extractf32x4_ps({in0}.v1, 3))); tmp1 = _mm_add_ps(tmp0, _mm_shuffle_ps(tmp0, tmp0, 
0xb1)); return nsimd_f32_to_f16(res + _mm_cvtss_f32(_mm_add_ps( tmp1, _mm_shuffle_ps(tmp1, tmp1, 0x4e))));''' . \ format(**fmtspec) # ----------------------------------------------------------------------------- # upconvert def upcvt1(simd_ext, from_typ, to_typ): # From f16 is easy if from_typ == 'f16': if to_typ == 'f32': return \ '''nsimd_{simd_ext}_vf32x2 ret; ret.v0 = {in0}.v0; ret.v1 = {in0}.v1; return ret;'''.format(**fmtspec) else: return \ '''nsimd_{simd_ext}_v{to_typ}x2 ret; ret.v0 = nsimd_cvt_{simd_ext}_{to_typ}_f32({in0}.v0); ret.v1 = nsimd_cvt_{simd_ext}_{to_typ}_f32({in0}.v1); return ret;'''.format(**fmtspec) # To f16 is easy if to_typ == 'f16': return \ '''nsimd_{simd_ext}_vf16x2 ret; nsimd_{simd_ext}_v{iu}16x2 buf; buf = nsimd_upcvt_{simd_ext}_{iu}16_{iu}8({in0}); ret.v0 = nsimd_cvt_{simd_ext}_f16_{iu}16(buf.v0); ret.v1 = nsimd_cvt_{simd_ext}_f16_{iu}16(buf.v1); return ret;'''.format(iu=from_typ[0], **fmtspec) # For integer upcast, due to 2's complement representation # epi_epi : signed -> bigger signed # epi_epi : signed -> bigger unsigned # epu_epi : unsigned -> bigger signed # epu_epi : unsigned -> bigger unsigned if from_typ in common.iutypes: suf_epep = 'ep{ui}{typnbits}_epi{typnbits2}'. \ format(ui='u' if from_typ in common.utypes else 'i', typnbits2=str(int(fmtspec['typnbits']) * 2), **fmtspec) else: suf_epep = 'ps_pd' # compute lower half if simd_ext in sse: lower_half = '{in0}'.format(**fmtspec) else: lower_half = extract(simd_ext, from_typ, LO, fmtspec['in0']) # compute upper half if simd_ext in sse: if from_typ in common.iutypes: upper_half = '_mm_shuffle_epi32({in0}, 14 /* 2 | 3 */)'. 
\ format(**fmtspec) else: upper_half = '''{pre}castpd_ps({pre}shuffle_pd( {pre}castps_pd({in0}), {pre}castps_pd({in0}), 1))'''.format(**fmtspec) else: upper_half = extract(simd_ext, from_typ, HI, fmtspec['in0']) # When intrinsics are provided # for conversions integers <-> floating point, there is no intrinsics, so # we use cvt's if from_typ == 'i32' and to_typ == 'f64': with_intrinsic = \ '''nsimd_{simd_ext}_vf64x2 ret; ret.v0 = {pre}cvtepi32_pd({lower_half}); ret.v1 = {pre}cvtepi32_pd({upper_half}); return ret;'''.format(upper_half=upper_half, lower_half=lower_half, **fmtspec) elif (from_typ in common.iutypes and to_typ in common.iutypes) or \ (from_typ == 'f32' and to_typ == 'f64'): with_intrinsic = \ '''nsimd_{simd_ext}_v{to_typ}x2 ret; ret.v0 = {pre}cvt{suf_epep}({lower_half}); ret.v1 = {pre}cvt{suf_epep}({upper_half}); return ret;'''.format(upper_half=upper_half, lower_half=lower_half, suf_epep=suf_epep, **fmtspec) else: from_typ2 = from_typ[0] + str(int(fmtspec['typnbits']) * 2) if from_typ not in common.iutypes: # getting here means that from_typ=f32 and to_typ=f64 with_intrinsic = \ '''nsimd_{simd_ext}_vf64x2 ret; ret.v0 = nsimd_cvt_{simd_ext}_{to_typ}_f64({pre}cvtps_pd( {lower_half})); ret.v1 = nsimd_cvt_{simd_ext}_{to_typ}_f64({pre}cvtps_pd( {upper_half})); return ret;'''. \ format(upper_half=upper_half, lower_half=lower_half, from_typ2=from_typ2, suf_epep=suf_epep, **fmtspec) # When no intrinsic is given for going from integers to floating or # from floating to integer we can go through a cvt if to_typ in common.ftypes: int_float = \ '''nsimd_{simd_ext}_v{to_typ}x2 ret; nsimd_{simd_ext}_v{int_typ}x2 tmp; tmp = nsimd_upcvt_{simd_ext}_{int_typ}_{from_typ}({in0}); ret.v0 = nsimd_cvt_{simd_ext}_{to_typ}_{int_typ}(tmp.v0); ret.v1 = nsimd_cvt_{simd_ext}_{to_typ}_{int_typ}(tmp.v1); return ret;'''. 
\ format(int_typ=from_typ[0] + to_typ[1:], lower_half=lower_half, upper_half=upper_half, **fmtspec) else: int_float = \ '''return nsimd_upcvt_{simd_ext}_{to_typ}_{int_typ}( nsimd_cvt_{simd_ext}_{int_typ}_{from_typ}({in0}));'''. \ format(int_typ=to_typ[0] + from_typ[1:], lower_half=lower_half, upper_half=upper_half, **fmtspec) # When no intrinsic is given we can use the trick of falling back to # the lower SIMD extension split_trick = \ '''nsimd_{simd_ext}_v{to_typ}x2 ret; nsimd_{simd_ext2}_v{to_typ}x2 ret2; ret2 = nsimd_upcvt_{simd_ext2}_{to_typ}_{from_typ}({lo}); ret.v0 = {merge}; ret2 = nsimd_upcvt_{simd_ext2}_{to_typ}_{from_typ}({hi}); ret.v1 = {merge}; return ret;'''. \ format(simd_ext2='sse42' if simd_ext == 'avx' else 'avx2', lo=extract(simd_ext, from_typ, LO, common.in0), hi=extract(simd_ext, from_typ, HI, common.in0), merge=setr(simd_ext, to_typ, 'ret2.v0', 'ret2.v1'), **fmtspec) # return C code if from_typ == 'i32' and to_typ == 'f64': return with_intrinsic if (from_typ in common.ftypes and to_typ in common.iutypes) or \ (from_typ in common.iutypes and to_typ in common.ftypes): return int_float # if simd_ext == 'sse2': if simd_ext in sse: if from_typ in common.itypes and to_typ in common.iutypes: return \ '''nsimd_{simd_ext}_v{to_typ}x2 ret; __m128i mask = _mm_cmpgt{suf}(_mm_setzero_si128(), {in0}); ret.v0 = _mm_unpacklo{suf}({in0}, mask); ret.v1 = _mm_unpackhi{suf}({in0}, mask); return ret;'''.format(**fmtspec) elif from_typ in common.utypes and to_typ in common.iutypes: return \ '''nsimd_{simd_ext}_v{to_typ}x2 ret; ret.v0 = _mm_unpacklo{suf}({in0}, _mm_setzero_si128()); ret.v1 = _mm_unpackhi{suf}({in0}, _mm_setzero_si128()); return ret;'''.format(**fmtspec) else: return with_intrinsic # elif simd_ext == 'sse42': # return with_intrinsic elif simd_ext == 'avx': if from_typ == 'i32' and to_typ == 'f64': return with_intrinsic else: return split_trick elif simd_ext == 'avx2': return with_intrinsic elif simd_ext == 'avx512_knl': if from_typ in ['i16', 'u16', 
'i32', 'u32', 'f32']: return with_intrinsic else: return split_trick else: return with_intrinsic # ----------------------------------------------------------------------------- # downconvert def downcvt1(opts, simd_ext, from_typ, to_typ): # From f16 is easy if from_typ == 'f16': le_to_typ = int(fmtspec['le']) * 2 le_1f32 = le_to_typ // 4 le_2f32 = 2 * le_to_typ // 4 le_3f32 = 3 * le_to_typ // 4 cast = castsi(simd_ext, to_typ) return \ '''{to_typ} dst[{le_to_typ}]; f32 src[{le_to_typ}]; int i; {pre}storeu_ps(src, {in0}.v0); {pre}storeu_ps(src + {le_1f32}, {in0}.v1); {pre}storeu_ps(src + {le_2f32}, {in1}.v0); {pre}storeu_ps(src + {le_3f32}, {in1}.v1); for (i = 0; i < {le_to_typ}; i++) {{ dst[i] = ({to_typ})src[i]; }} return {pre}loadu_si{nbits}({cast}dst);'''. \ format(le_to_typ=le_to_typ, le_1f32=le_1f32, le_2f32=le_2f32, le_3f32=le_3f32, cast=cast, **fmtspec) # To f16 is easy if to_typ == 'f16': if from_typ == 'f32': return \ '''nsimd_{simd_ext}_vf16 ret; ret.v0 = {in0}; ret.v1 = {in1}; return ret;'''.format(**fmtspec) else: return \ '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_cvt_{simd_ext}_f32_{from_typ}({in0}); ret.v1 = nsimd_cvt_{simd_ext}_f32_{from_typ}({in1}); return ret;'''.format(**fmtspec) # f64 --> f32 have intrinsics if from_typ == 'f64' and to_typ == 'f32': if simd_ext in sse: return '''return _mm_movelh_ps(_mm_cvtpd_ps({in0}), _mm_cvtpd_ps({in1}));'''. 
\ format(**fmtspec) else: return 'return {};'.format(setr(simd_ext, 'f32', '{pre}cvtpd_ps({in0})'.format(**fmtspec), '{pre}cvtpd_ps({in1})'.format(**fmtspec))) # integer conversions intrinsics are only available with AVX-512 if simd_ext in avx512: if (from_typ in ['i32', 'i64'] and to_typ in common.itypes) or \ (simd_ext == 'avx512_skylake' and from_typ == 'i16' and \ to_typ == 'i8'): return 'return {};'.format(setr(simd_ext, to_typ, '{pre}cvtep{from_typ}_ep{to_typ}({in0})'.format(**fmtspec), '{pre}cvtep{from_typ}_ep{to_typ}({in1})'.format(**fmtspec))) elif from_typ == 'i64' and to_typ == 'f32': return 'return nsimd_cvt_{simd_ext}_f32_i32({});'. \ format(setr(simd_ext, from_typ, '{pre}cvtepi64_epi32({in0})'.format(**fmtspec), '{pre}cvtepi64_epi32({in1})'.format(**fmtspec)), **fmtspec) # and then emulation le_to_typ = 2 * int(fmtspec['le']) cast_src = '(__m{nbits}i *)'.format(**fmtspec) \ if from_typ in common.iutypes else '' cast_dst = '(__m{nbits}i *)'.format(**fmtspec) \ if to_typ in common.iutypes else '' return \ '''{to_typ} dst[{le_to_typ}]; {from_typ} src[{le_to_typ}]; int i; {pre}storeu{sufsi}({cast_src}src, {in0}); {pre}storeu{sufsi}({cast_src}(src + {le}), {in1}); for (i = 0; i < {le_to_typ}; i++) {{ dst[i] = ({to_typ})src[i]; }} return {pre}loadu{sufsi_to_typ}({cast_dst}dst);'''. 
\ format(cast_src=cast_src, cast_dst=cast_dst, le_to_typ=le_to_typ, sufsi_to_typ=suf_si(simd_ext, to_typ), **fmtspec) # ----------------------------------------------------------------------------- # adds / subs helper def adds_subs_intrinsic_instructions_i8_i16_u8_u16(which_op, simd_ext, typ): valid_types = ('i8', 'i16', 'u8', 'u16') if typ not in valid_types: raise TypeError( '''def adds_subs_intrinsic_instructions_i8_i16_u8_u16(...): {typ} must belong to the following types set: {valid_types}'''.\ format(typ=typ, valid_types=valid_types) ) if 'sse2' in simd_ext or 'sse42' in simd_ext: return''' return _mm_{which_op}_ep{typ}({in0}, {in1}); '''.format(which_op=which_op, **fmtspec) if 'avx' == simd_ext: return split_opn(which_op, simd_ext, typ, 2) if simd_ext in ('avx2', 'avx512_skylake'): return 'return {pre}{which_op}_ep{typ}({in0}, {in1});'. \ format(which_op=which_op, **fmtspec) if 'avx512_knl' == simd_ext: return split_opn(which_op, simd_ext, typ, 2) def get_avx512_sse2_i32_i64_dependent_code(simd_ext, typ): if 'avx512' in simd_ext or 'sse2' in simd_ext: mask_processing = \ '''/* For avx512/sse2 */ const nsimd_{simd_ext}_vu{typnbits} mask_strong_bit = nsimd_shr_{simd_ext}_u{typnbits}( mask, sizeof(u{typnbits}) * CHAR_BIT - 1); const nsimd_{simd_ext}_vi{typnbits} imask_strong_bit = nsimd_reinterpret_{simd_ext}_i{typnbits}_u{typnbits}( mask_strong_bit); const nsimd_{simd_ext}_vli{typnbits} limask_strong_bit = nsimd_to_logical_{simd_ext}_i{typnbits}(imask_strong_bit);'''. \ format(**fmtspec) if_else = \ '''/* For avx512/sse2 */ return nsimd_if_else1_{simd_ext}_i{typnbits}( limask_strong_bit, ires, i_max_min);'''. 
\ format(**fmtspec) else: mask_processing = '/* Before avx512: is_same(__m128i, ' \ 'vector, vector, ' \ 'vector) */' suf2 = 'ps' if typ in ['i32', 'u32'] else 'pd' if_else = '''return {pre}cast{suf2}_si{nbits}({pre}blendv_{suf2}( {pre}castsi{nbits}_{suf2}(i_max_min), {pre}castsi{nbits}_{suf2}(ires), {pre}castsi{nbits}_{suf2}(mask))); '''.format(suf2=suf2, **fmtspec) return { 'mask_processing': mask_processing, 'if_else': if_else } # ----------------------------------------------------------------------------- # adds def adds(simd_ext, typ): if typ in common.ftypes: return 'return nsimd_add_{simd_ext}_{typ}({in0}, {in1});'. \ format(**fmtspec) if typ in ('i8', 'i16', 'u8', 'u16'): return adds_subs_intrinsic_instructions_i8_i16_u8_u16( 'adds', simd_ext, typ) if typ in common.utypes: return \ '''/* Algo pseudo code: */ /* ures = a + b */ /* if overflow then ures < a && ures < b */ /* --> test against a single value: if(ures < a){{ overflow ; }} */ /* return ures < a ? {type_max} : ures */ const nsimd_{simd_ext}_v{typ} ures = nsimd_add_{simd_ext}_{typ}({in0}, {in1}); const nsimd_{simd_ext}_v{typ} type_max = nsimd_set1_{simd_ext}_{typ}(({typ}){type_max}); return nsimd_if_else1_{simd_ext}_{typ}( nsimd_lt_{simd_ext}_{typ}(ures, {in0}), type_max, ures);'''. \ format(type_max=common.limits[typ]['max'], **fmtspec) avx512_sse2_i32_i64_dependent_code = \ get_avx512_sse2_i32_i64_dependent_code(simd_ext, typ) return \ '''/* Algo pseudo code: */ /* if ( ( same_sign(ux, uy) && same_sign(uy, res) ) || */ /* ! 
same_sign(ux, uy) ): */ /* neither overflow nor underflow happened */ /* else: */ /* if(ux > 0 && uy > 0): res = MAX // overflow */ /* else: res = MIN // underflow */ /* Step 1: reinterpret to unsigned to work with the bits */ nsimd_{simd_ext}_vu{typnbits} ux = nsimd_reinterpret_{simd_ext}_u{typnbits}_i{typnbits}({in0}); const nsimd_{simd_ext}_vu{typnbits} uy = nsimd_reinterpret_{simd_ext}_u{typnbits}_i{typnbits}({in1}); const nsimd_{simd_ext}_vu{typnbits} ures = nsimd_add_{simd_ext}_u{typnbits}(ux, uy); /* Step 2: check signs different: ux, uy, res */ /* xor_ux_uy's most significant bit will be zero if both ux and */ /* uy have same sign */ const nsimd_{simd_ext}_vu{typnbits} xor_ux_uy = nsimd_xorb_{simd_ext}_u{typnbits}(ux, uy); /* xor_uy_res's most significant bit will be zero if both uy and */ /* ures have same sign */ const nsimd_{simd_ext}_vu{typnbits} xor_uy_res = nsimd_xorb_{simd_ext}_u{typnbits}(uy, ures); /* Step 3: Construct the MIN/MAX vector */ /* Pseudo code: */ /* Both positive --> overflow possible */ /* --> get the MAX: */ /* (signed)ux >= 0 && (signed)uy >= 0 */ /* <=> ((unsigned)ux | (unsigned)uy) >> 31 == 0 */ /* --> MAX + ( (ux | uy) >> 31 ) == MAX + 0 == MAX */ /* At least one negative */ /* --> overflow not possible / underflow possible if both negative */ /* --> get the MIN: */ /* unsigned tmp = (unsigned)MAX + */ /* ( ( (ux | uy) >> 31 ) == (unsigned)MAX + 1 ) */ /* --> MIN = (reinterpret signed)tmp */ /* ux | uy */ const nsimd_{simd_ext}_vu{typnbits} ux_uy_orb = nsimd_orb_{simd_ext}_u{typnbits}(ux, uy); /* (ux | uy) >> 31 --> Vector of 0's and 1's */ const nsimd_{simd_ext}_vu{typnbits} u_zeros_ones = nsimd_shr_{simd_ext}_u{typnbits}( ux_uy_orb, sizeof(u{typnbits}) * CHAR_BIT - 1); /* MIN/MAX vector */ /* i{typnbits} tmp = sMAX + 1 --> undefined behavior */ /* u{typnbits} tmp = (u{typnbits})sMAX + 1 */ /* i{typnbits} sMIN = *(i{typnbits}*)(&tmp) */ const nsimd_{simd_ext}_vu{typnbits} u_max = 
nsimd_set1_{simd_ext}_u{typnbits}((u{typnbits}){type_max}); const nsimd_{simd_ext}_vu{typnbits} u_max_min = nsimd_add_{simd_ext}_u{typnbits}(u_max, u_zeros_ones); const nsimd_{simd_ext}_vi{typnbits} i_max_min = nsimd_reinterpret_{simd_ext}_i{typnbits}_u{typnbits}(u_max_min); /* Step 4: Construct the mask vector */ /* mask == ( 8ot_same_sign(ux, uy) || same_sign(uy, res) ) */ /* mask: True (no underflow/overflow) / False (underflow/overflow) */ /* mask = xor_ux_uy | ~ xor_uy_res */ const nsimd_{simd_ext}_vu{typnbits} not_xor_uy_res = nsimd_notb_{simd_ext}_u{typnbits}(xor_uy_res); const nsimd_{simd_ext}_vu{typnbits} mask = nsimd_orb_{simd_ext}_u{typnbits}(xor_ux_uy, not_xor_uy_res); {avx512_sse2_dependent_mask_processing} /* Step 5: Apply the Mask */ const nsimd_{simd_ext}_vi{typnbits} ires = nsimd_reinterpret_{simd_ext}_i{typnbits}_u{typnbits}(ures); {avx512_sse2_dependent_if_else}'''. \ format(type_max = common.limits[typ]['max'], avx512_sse2_dependent_mask_processing = \ avx512_sse2_i32_i64_dependent_code['mask_processing'], avx512_sse2_dependent_if_else = \ avx512_sse2_i32_i64_dependent_code['if_else'], **fmtspec) # ----------------------------------------------------------------------------- # subs def subs(simd_ext, typ): if typ in common.ftypes: return 'return nsimd_sub_{simd_ext}_{typ}({in0}, {in1});'. 
\ format(**fmtspec) if typ in ('i8', 'i16', 'u8', 'u16'): return adds_subs_intrinsic_instructions_i8_i16_u8_u16( 'subs', simd_ext, typ) if typ in common.itypes: return 'return nsimd_adds_{simd_ext}_{typ}({in0}, ' \ 'nsimd_neg_{simd_ext}_{typ}({in1}));'.format(**fmtspec) min_ = common.limits[typ]['min'] return \ '''/* Algo pseudo code: */ /* unsigned only */ /* a > 0; b > 0 ==> a - b --> possibility for underflow only */ /* if b > a --> underflow */ const nsimd_{simd_ext}_v{typ} ures = nsimd_sub_{simd_ext}_{typ}({in0}, {in1}); const nsimd_{simd_ext}_vl{typ} is_underflow = nsimd_gt_{simd_ext}_{typ}({in1}, {in0}); const nsimd_{simd_ext}_v{typ} umin = nsimd_set1_{simd_ext}_{typ}(({typ}){min_}); return nsimd_if_else1_{simd_ext}_{typ}(is_underflow, umin, ures);'''. \ format(min_=min_, **fmtspec) # ----------------------------------------------------------------------------- # to_mask def to_mask1(simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_to_mask_{simd_ext}_f32({in0}.v0); ret.v1 = nsimd_to_mask_{simd_ext}_f32({in0}.v1); return ret;'''.format(**fmtspec) if simd_ext in sse + avx: return 'return {in0};'.format(**fmtspec) elif simd_ext == 'avx512_skylake': if typ in common.iutypes: return 'return _mm512_movm_epi{typnbits}({in0});'. \ format(**fmtspec) elif typ in ['f32', 'f64']: return '''return _mm512_castsi512{suf}( _mm512_movm_epi{typnbits}({in0}));'''. \ format(**fmtspec) else: if typ in ['i32', 'u32', 'i64', 'u64']: return '''return _mm512_mask_mov{suf}(_mm512_setzero_si512(), {in0}, _mm512_set1_epi32(-1));'''. \ format(**fmtspec) elif typ in ['f32', 'f64']: return '''return _mm512_mask_mov{suf}(_mm512_castsi512{suf}( _mm512_setzero_si512()), {in0}, _mm512_castsi512{suf}( _mm512_set1_epi32(-1)));'''. 
\ format(**fmtspec) else: return '''{typ} buf[{le}]; int i; for (i = 0; i < {le}; i++) {{ if (({in0} >> i) & 1) {{ buf[i] = ({typ})-1; }} else {{ buf[i] = ({typ})0; }} }} return _mm512_loadu_si512(buf);'''.format(**fmtspec) # ----------------------------------------------------------------------------- # to_logical def to_logical1(simd_ext, typ): if typ in common.iutypes: return '''return nsimd_ne_{simd_ext}_{typ}( {in0}, {pre}setzero{sufsi}());'''.format(**fmtspec) elif typ in ['f32', 'f64']: return '''return nsimd_reinterpretl_{simd_ext}_{typ}_{utyp}( nsimd_ne_{simd_ext}_{utyp}( {pre}cast{suf2}_si{nbits}({in0}), {pre}setzero_si{nbits}()));'''. \ format(suf2=suf_si(simd_ext, typ)[1:], utyp='u{}'.format(fmtspec['typnbits']), **fmtspec) else: return '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = nsimd_to_logical_{simd_ext}_f32({in0}.v0); ret.v1 = nsimd_to_logical_{simd_ext}_f32({in0}.v1); return ret;'''.format(**fmtspec) # ----------------------------------------------------------------------------- # zip functions def zip_half(func, simd_ext, typ): simd_ext2 = 'sse42' if simd_ext in avx else 'avx2' if simd_ext in sse: if typ == 'f16': return '''nsimd_{simd_ext}_v{typ} ret; ret.v0 = _mm_unpacklo_ps({in0}.v{k}, {in1}.v{k}); ret.v1 = _mm_unpackhi_ps({in0}.v{k}, {in1}.v{k}); return ret;'''. \ format(k='0' if func == 'ziplo' else '1', **fmtspec) else: return 'return {pre}unpack{lo}{suf}({in0}, {in1});'. \ format(lo='lo' if func == 'ziplo' else 'hi', **fmtspec) elif simd_ext in avx: # Currently, 256 and 512 bits vectors are splitted into 128 bits # vectors in order to perform the ziplo/hi operation using the # unpacklo/hi sse operations. 
if typ == 'f16': in0vk = '{in0}.v{k}'.format(k='0' if func == 'ziplo' else '1', **fmtspec) in1vk = '{in1}.v{k}'.format(k='0' if func == 'ziplo' else '1', **fmtspec) return \ '''nsimd_{simd_ext}_v{typ} ret; __m128 v_tmp0 = {get_low_in0vk}; __m128 v_tmp1 = {get_low_in1vk}; __m128 v_tmp2 = {get_high_in0vk}; __m128 v_tmp3 = {get_high_in1vk}; __m128 vres_lo0 = _mm_unpacklo_ps(v_tmp0, v_tmp1); __m128 vres_hi0 = _mm_unpackhi_ps(v_tmp0, v_tmp1); ret.v0 = {merge0}; __m128 vres_lo1 = _mm_unpacklo_ps(v_tmp2, v_tmp3); __m128 vres_hi1 = _mm_unpackhi_ps(v_tmp2, v_tmp3); ret.v1 = {merge1}; return ret; '''.format(get_low_in0vk=extract(simd_ext, 'f32', LO, in0vk), get_low_in1vk=extract(simd_ext, 'f32', LO, in1vk), get_high_in0vk=extract(simd_ext, 'f32', HI, in0vk), get_high_in1vk=extract(simd_ext, 'f32', HI, in1vk), merge0=setr(simd_ext, 'f32', 'vres_lo0', 'vres_hi0'), merge1=setr(simd_ext, 'f32', 'vres_lo1', 'vres_hi1'), **fmtspec) else: hl = LO if func == 'ziplo' else HI return \ '''{nat} v_tmp0 = {half_in0}; {nat} v_tmp1 = {half_in1}; {nat} vres_lo = _mm_unpacklo{suf}(v_tmp0, v_tmp1); {nat} vres_hi = _mm_unpackhi{suf}(v_tmp0, v_tmp1); return {merge}; '''.format(nat=get_native_typ(simd_ext2, typ), half_in0=extract(simd_ext, typ, hl, common.in0), half_in1=extract(simd_ext, typ, hl, common.in1), merge=setr(simd_ext, typ, 'vres_lo', 'vres_hi'), **fmtspec) else: if typ == 'f16': return \ '''nsimd_{simd_ext}_v{typ} ret; __m512 v0 = {in0}.v{k}; __m512 v1 = {in1}.v{k}; __m256 v_tmp0, v_tmp1, vres_lo, vres_hi; /* Low part */ v_tmp0 = {low_v0}; v_tmp1 = {low_v1}; vres_lo = nsimd_ziplo_avx2_f32(v_tmp0, v_tmp1); vres_hi = nsimd_ziphi_avx2_f32(v_tmp0, v_tmp1); ret.v0 = {merge}; /* High part */ v_tmp0 = {high_v0}; v_tmp1 = {high_v1}; vres_lo = nsimd_ziplo_avx2_f32(v_tmp0, v_tmp1); vres_hi = nsimd_ziphi_avx2_f32(v_tmp0, v_tmp1); ret.v1 = {merge}; return ret;'''. 
\ format(k='0' if func == 'ziplo' else '1', low_v0=extract(simd_ext, 'f32', LO, 'v0'), low_v1=extract(simd_ext, 'f32', LO, 'v1'), high_v0=extract(simd_ext, 'f32', HI, 'v0'), high_v1=extract(simd_ext, 'f32', HI, 'v1'), merge=setr(simd_ext, 'f32', 'vres_lo', 'vres_hi'), **fmtspec) else: hl = LO if func == 'ziplo' else HI return \ '''{nat} v_tmp0, v_tmp1; v_tmp0 = {half_in0}; v_tmp1 = {half_in1}; {nat} vres_lo = nsimd_ziplo_avx2_{typ}(v_tmp0, v_tmp1); {nat} vres_hi = nsimd_ziphi_avx2_{typ}(v_tmp0, v_tmp1); return {merge};'''. \ format(nat=get_native_typ(simd_ext2, typ), half_in0=extract(simd_ext, typ, hl, common.in0), half_in1=extract(simd_ext, typ, hl, common.in1), merge=setr(simd_ext, typ, 'vres_lo', 'vres_hi'), **fmtspec) def zip(simd_ext, typ): return '''nsimd_{simd_ext}_v{typ}x2 ret; ret.v0 = nsimd_ziplo_{simd_ext}_{typ}({in0}, {in1}); ret.v1 = nsimd_ziphi_{simd_ext}_{typ}({in0}, {in1}); return ret; '''.format(**fmtspec) # ----------------------------------------------------------------------------- # unzip functions def unzip_half(opts, func, simd_ext, typ): loop = '''{typ} tab[{lex2}]; {typ} res[{le}]; int i; nsimd_storeu_{simd_ext}_{typ}(tab, {in0}); nsimd_storeu_{simd_ext}_{typ}(tab + {le}, {in1}); for(i = 0; i < {le}; i++) {{ res[i] = tab[2 * i + {offset}]; }} return nsimd_loadu_{simd_ext}_{typ}(res); '''.format(lex2=2 * int(fmtspec['le']), offset='0' if func == 'unziplo' else '1', **fmtspec) if simd_ext in sse: if typ in ['f32', 'i32', 'u32']: v0 = ('_mm_castsi128_ps({in0})' if typ in ['i32', 'u32'] \ else '{in0}').format(**fmtspec) v1 = ('_mm_castsi128_ps({in1})' if typ in ['i32', 'u32'] \ else '{in1}').format(**fmtspec) ret = ('_mm_castps_si128(v_res)' if typ in ['i32', 'u32'] \ else 'v_res').format(**fmtspec) return '''__m128 v_res; v_res = _mm_shuffle_ps({v0}, {v1}, {mask}); return {ret};'''.format( mask='_MM_SHUFFLE(2, 0, 2, 0)' if func == 'unziplo' \ else '_MM_SHUFFLE(3, 1, 3, 1)', v0=v0, v1=v1, ret=ret, **fmtspec) elif typ == 'f16': return \ 
'''nsimd_{simd_ext}_v{typ} v_res; v_res.v0 = _mm_shuffle_ps({in0}.v0, {in0}.v1, {mask}); v_res.v1 = _mm_shuffle_ps({in1}.v0, {in1}.v1, {mask}); return v_res;'''.format(mask='_MM_SHUFFLE(2, 0, 2, 0)' \ if func == 'unziplo' \ else '_MM_SHUFFLE(3, 1, 3, 1)', **fmtspec) elif typ in ['f64', 'i64', 'u64']: v0 = ('_mm_castsi128_pd({in0})' if typ in ['i64', 'u64'] \ else '{in0}').format(**fmtspec) v1 = ('_mm_castsi128_pd({in1})' if typ in ['i64', 'u64'] \ else '{in1}').format(**fmtspec) ret = ('_mm_castpd_si128(v_res)' if typ in ['i64', 'u64'] \ else 'v_res').format(**fmtspec) return '''__m128d v_res; v_res = _mm_shuffle_pd({v0}, {v1}, {mask}); return {ret}; '''.format(mask='0' if func == 'unziplo' else '3', v0=v0, v1=v1, ret=ret, **fmtspec) elif typ in ['i16', 'u16']: return '''__m128i v_tmp0 = _mm_shufflelo_epi16( {in0}, _MM_SHUFFLE(3, 1, 2, 0)); v_tmp0 = _mm_shufflehi_epi16(v_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); __m128i v_tmp1 = _mm_shufflelo_epi16({in1}, _MM_SHUFFLE(3, 1, 2, 0)); v_tmp1 = _mm_shufflehi_epi16(v_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); __m128 v_res = _mm_shuffle_ps(_mm_castsi128_ps(v_tmp0), _mm_castsi128_ps(v_tmp1), {mask}); return _mm_castps_si128(v_res); '''.format(mask='_MM_SHUFFLE(2, 0, 2, 0)' \ if func == 'unziplo' \ else '_MM_SHUFFLE(3, 1, 3, 1)', **fmtspec) else: return loop elif simd_ext in avx: ret_template = \ '''v_tmp0 = _mm256_permute2f128_{t}({v0}, {v0}, 0x01); v_tmp0 = _mm256_shuffle_{t}({v0}, v_tmp0, {mask}); v_tmp1 = _mm256_permute2f128_{t}({v1}, {v1}, 0x01); v_tmp1 = _mm256_shuffle_{t}({v1}, v_tmp1, {mask}); v_res = _mm256_permute2f128_{t}(v_tmp0, v_tmp1, 0x20); {ret} = {v_res};''' if typ in ['f32', 'i32', 'u32']: v0 = '_mm256_castsi256_ps({in0})' \ if typ in ['i32', 'u32'] else '{in0}' v1 = '_mm256_castsi256_ps({in1})' \ if typ in ['i32', 'u32'] else '{in1}' v_res = '_mm256_castps_si256(v_res)' \ if typ in ['i32', 'u32'] else 'v_res' ret = 'ret' src = ret_template.format(mask='_MM_SHUFFLE(2, 0, 2, 0)' \ if func == 'unziplo' else '_MM_SHUFFLE(3, 1, 3, 
1)', v0=v0, v1=v1, v_res=v_res, ret=ret, t='ps', **fmtspec) return '''nsimd_{simd_ext}_v{typ} ret; __m256 v_res, v_tmp0, v_tmp1; {src} return ret;'''. \ format(src=src.format(**fmtspec), **fmtspec) elif typ == 'f16': src0 = ret_template.format(mask='_MM_SHUFFLE(2, 0, 2, 0)' \ if func == 'unziplo' else '_MM_SHUFFLE(3, 1, 3, 1)', v0='{in0}.v0', v1='{in0}.v1', v_res='v_res', ret='ret.v0', t='ps') src1 = ret_template.format(mask='_MM_SHUFFLE(2, 0, 2, 0)' \ if func == 'unziplo' else '_MM_SHUFFLE(3, 1, 3, 1)', v0='{in1}.v0', v1='{in1}.v1', v_res='v_res', ret='ret.v1', t='ps') return '''nsimd_{simd_ext}_v{typ} ret; __m256 v_res, v_tmp0, v_tmp1; {src0} {src1} return ret;'''.format(src0=src0.format(**fmtspec), src1=src1.format(**fmtspec), **fmtspec) elif typ in ['f64', 'i64', 'u64']: v0 = ('_mm256_castsi256_pd({in0})' \ if typ in ['i64', 'u64'] else '{in0}').format(**fmtspec) v1 = ('_mm256_castsi256_pd({in1})' \ if typ in ['i64', 'u64'] else '{in1}').format(**fmtspec) v_res = ('_mm256_castpd_si256(v_res)' \ if typ in ['i64', 'u64'] else 'v_res'). \ format(**fmtspec) src = ret_template.format(mask='0x00' if func == 'unziplo' \ else '0x03', v0=v0, v1=v1, ret='ret', v_res=v_res, t='pd') return '''nsimd_{simd_ext}_v{typ} ret; __m256d v_res, v_tmp0, v_tmp1; {src} return ret;'''.format(src=src.format(**fmtspec), **fmtspec) elif typ in ['i16', 'u16']: return \ '''__m128i v_tmp0_hi = {hi0}; __m128i v_tmp0_lo = {lo0}; __m128i v_tmp1_hi = {hi1}; __m128i v_tmp1_lo = {lo1}; v_tmp0_lo = nsimd_{func}_sse2_{typ}(v_tmp0_lo, v_tmp0_hi); v_tmp1_lo = nsimd_{func}_sse2_{typ}(v_tmp1_lo, v_tmp1_hi); return {merge};'''. 
\
                   format(hi0=extract(simd_ext, typ, HI, common.in0),
                          lo0=extract(simd_ext, typ, LO, common.in0),
                          hi1=extract(simd_ext, typ, HI, common.in1),
                          lo1=extract(simd_ext, typ, LO, common.in1),
                          merge=setr(simd_ext, typ, 'v_tmp0_lo',
                                     'v_tmp1_lo'),
                          func=func, **fmtspec)
        else:
            # No native shuffle for this type on AVX: fall back to the
            # emulation loop computed earlier by the caller.
            return loop
    else:
        # AVX-512: split into two 256-bit halves and delegate to the AVX2
        # implementation, then merge the two partial results.
        if typ == 'f16':
            return \
            '''nsimd_{simd_ext}_v{typ} ret;
               __m256 v_tmp0, v_tmp1, v_res_lo, v_res_hi;
               v_tmp0 = {loin0v0};
               v_tmp1 = {hiin0v0};
               v_res_lo = nsimd_{func}_avx2_f32(v_tmp0, v_tmp1);
               v_tmp0 = {loin0v1};
               v_tmp1 = {hiin0v1};
               v_res_hi = nsimd_{func}_avx2_f32(v_tmp0, v_tmp1);
               ret.v0 = {merge};
               v_tmp0 = {loin1v0};
               v_tmp1 = {hiin1v0};
               v_res_lo = nsimd_{func}_avx2_f32(v_tmp0, v_tmp1);
               v_tmp0 = {loin1v1};
               v_tmp1 = {hiin1v1};
               v_res_hi = nsimd_{func}_avx2_f32(v_tmp0, v_tmp1);
               ret.v1 = {merge};
               return ret;'''.format(
                   loin0v0=extract(simd_ext, 'f32', LO, common.in0 + '.v0'),
                   hiin0v0=extract(simd_ext, 'f32', HI, common.in0 + '.v0'),
                   loin0v1=extract(simd_ext, 'f32', LO, common.in0 + '.v1'),
                   hiin0v1=extract(simd_ext, 'f32', HI, common.in0 + '.v1'),
                   loin1v0=extract(simd_ext, 'f32', LO, common.in1 + '.v0'),
                   hiin1v0=extract(simd_ext, 'f32', HI, common.in1 + '.v0'),
                   loin1v1=extract(simd_ext, 'f32', LO, common.in1 + '.v1'),
                   hiin1v1=extract(simd_ext, 'f32', HI, common.in1 + '.v1'),
                   merge=setr(simd_ext, 'f32', 'v_res_lo', 'v_res_hi'),
                   func=func, **fmtspec)
        else:
            return '''nsimd_avx2_v{typ} v00 = {extract_lo0};
                      nsimd_avx2_v{typ} v01 = {extract_hi0};
                      nsimd_avx2_v{typ} v10 = {extract_lo1};
                      nsimd_avx2_v{typ} v11 = {extract_hi1};
                      v00 = nsimd_{func}_avx2_{typ}(v00, v01);
                      v01 = nsimd_{func}_avx2_{typ}(v10, v11);
                      return {merge};'''.format(
                      func=func,
                      extract_lo0=extract(simd_ext, typ, LO, common.in0),
                      extract_lo1=extract(simd_ext, typ, LO, common.in1),
                      extract_hi0=extract(simd_ext, typ, HI, common.in0),
                      extract_hi1=extract(simd_ext, typ, HI, common.in1),
                      merge=setr(simd_ext, typ, 'v00', 'v01'), **fmtspec)

# Full unzip: v0 receives the even-indexed lanes of (in0, in1), v1 the
# odd-indexed ones, via the already-generated unziplo/unziphi operators.
def unzip(simd_ext, typ):
    return '''nsimd_{simd_ext}_v{typ}x2 ret;
              ret.v0 = nsimd_unziplo_{simd_ext}_{typ}({in0}, {in1});
              ret.v1 = nsimd_unziphi_{simd_ext}_{typ}({in0}, {in1});
              return ret;'''.format(**fmtspec)

# -----------------------------------------------------------------------------
# mask_for_loop_tail

def mask_for_loop_tail(simd_ext, typ):
    # Emits C returning a logical vector that is true for lanes i with
    # in0 + i < in1 (all-false / all-true fast paths handled first).
    if typ == 'f16':
        fill_n = '''n.v0 = {pre}set1_ps((f32)({in1} - {in0}));
                    n.v1 = n.v0;'''.format(**fmtspec)
    else:
        fill_n = 'n = nsimd_set1_{simd_ext}_{typ}(({typ})({in1} - {in0}));'. \
                 format(**fmtspec)
    return '''if ({in0} >= {in1}) {{
                return nsimd_set1l_{simd_ext}_{typ}(0);
              }}
              if ({in1} - {in0} < {le}) {{
                nsimd_{simd_ext}_v{typ} n;
                {fill_n}
                return nsimd_lt_{simd_ext}_{typ}(
                           nsimd_iota_{simd_ext}_{typ}(), n);
              }} else {{
                return nsimd_set1l_{simd_ext}_{typ}(1);
              }}'''.format(fill_n=fill_n, **fmtspec)

# -----------------------------------------------------------------------------
# iota

def iota(simd_ext, typ):
    # Emits C building a vector whose lane i holds (typ)i; f16 is emulated
    # through a f32 buffer loaded in two halves.
    typ2 = 'f32' if typ == 'f16' else typ
    iota = ', '.join(['({typ2}){i}'.format(typ2=typ2, i=i) \
                      for i in range(int(fmtspec['le']))])
    if typ == 'f16':
        return '''f32 buf[{le}] = {{ {iota} }};
                  nsimd_{simd_ext}_vf16 ret;
                  ret.v0 = {pre}loadu_ps(buf);
                  ret.v1 = {pre}loadu_ps(buf + {le2});
                  return ret;'''. \
                  format(iota=iota, le2=fmtspec['le'] // 2, **fmtspec)
    return '''{typ} buf[{le}] = {{ {iota} }};
              return {pre}loadu{sufsi}({cast}buf);'''. \
              format(iota=iota, cast='(__m{nbits}i*)'.format(**fmtspec) \
                                if typ in common.iutypes else '', **fmtspec)

# -----------------------------------------------------------------------------
# scatter

def scatter(simd_ext, typ):
    # Indexed scatter. f16 and narrow/old-ISA types go through a buffer +
    # scalar loop; 32/64-bit types on AVX-512 use the native intrinsic.
    if typ == 'f16':
        return '''int i;
                  f32 buf[{le}];
                  i16 offset_buf[{le}];
                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in1});
                  {pre}storeu_ps(buf, {in2}.v0);
                  {pre}storeu_ps(buf + {leo2}, {in2}.v1);
                  for (i = 0; i < {le}; i++) {{
                    {in0}[offset_buf[i]] = nsimd_f32_to_f16(buf[i]);
                  }}'''.format(leo2=int(fmtspec['le']) // 2, **fmtspec)
    if simd_ext in (sse + avx) or typ in ['i8', 'u8', 'i16', 'u16']:
        cast = castsi(simd_ext, typ)
        return '''int i;
                  {typ} buf[{le}];
                  {ityp} offset_buf[{le}];
                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in1});
                  {pre}storeu{sufsi}({cast}buf, {in2});
                  for (i = 0; i < {le}; i++) {{
                    {in0}[offset_buf[i]] = buf[i];
                  }}'''.format(ityp='i' + typ[1:], cast=cast, **fmtspec)
    # getting here means 32 and 64-bits types for avx512
    return '''{pre}i{typnbits}scatter{suf}(
                  (void *){in0}, {in1}, {in2}, {scale});'''. \
                  format(scale=int(typ[1:]) // 8, **fmtspec)

# -----------------------------------------------------------------------------
# linear scatter

def scatter_linear(simd_ext, typ):
    # Scatter with constant stride in1. AVX-512 delegates to two AVX2 calls;
    # other ISAs use a buffer loop (32-bit hosts) or a per-lane extract trick.
    if typ == 'f16':
        return '''int i;
                  f32 buf[{le}];
                  {pre}storeu_ps(buf, {in2}.v0);
                  {pre}storeu_ps(buf + {leo2}, {in2}.v1);
                  for (i = 0; i < {le}; i++) {{
                    {in0}[i * {in1}] = nsimd_f32_to_f16(buf[i]);
                  }}'''.format(leo2=int(fmtspec['le']) // 2, **fmtspec)
    if simd_ext in avx512:
        return '''nsimd_scatter_linear_avx2_{typ}({in0}, {in1}, {lo});
                  nsimd_scatter_linear_avx2_{typ}({in0} + ({leo2} * {in1}),
                                                  {in1}, {hi});'''. \
                  format(leo2=int(fmtspec['le']) // 2,
                         lo=extract(simd_ext, typ, LO, fmtspec['in2']),
                         hi=extract(simd_ext, typ, HI, fmtspec['in2']),
                         **fmtspec)
    emulation = '''int i;
                   {typ} buf[{le}];
                   {pre}storeu{sufsi}({cast}buf, {in2});
                   for (i = 0; i < {le}; i++) {{
                     {in0}[i * {in1}] = buf[i];
                   }}'''.format(cast=castsi(simd_ext, typ), **fmtspec)
    if (simd_ext == 'sse2' and typ in ['i16', 'u16']) or \
       (simd_ext == 'avx' and \
        typ in ['i32', 'u32', 'f32', 'i64', 'u64', 'f64']) or \
       (simd_ext in ['sse42', 'avx2']):
        # On 64-bit hosts emit one extract-lane store per element instead of
        # spilling the whole vector to a buffer.
        trick = '\n'.join([
            '{in0}[{i} * {in1}] = {get_lane};'.format(i=i,
                get_lane=get_lane(simd_ext, typ,
                                  '{in2}'.format(**fmtspec), i),
                **fmtspec) for i in range(int(fmtspec['le']))])
        return '''#if NSIMD_WORD_SIZE == 32
                  {}
                  #else
                  {}
                  #endif'''.format(emulation, trick)
    else:
        return emulation

# -----------------------------------------------------------------------------
# mask_scatter

def mask_scatter(simd_ext, typ):
    # Masked indexed scatter: only lanes whose mask lane is non-zero are
    # written. AVX-512 logicals are bitmasks, hence the (mask >> i) & 1 test.
    if typ == 'f16':
        le2 = fmtspec['le'] // 2
        if simd_ext in sse + avx:
            store_mask = '''{pre}storeu_ps(mask, {in0}.v0);
                            {pre}storeu_ps(mask + {le2}, {in0}.v1);'''. \
                            format(le2=le2, **fmtspec)
        else:
            # AVX-512 f16 logicals are __mmask16's: expand them to a f32
            # buffer of 0.0/1.0 before the scalar loop.
            store_mask = '''_mm512_storeu_ps(mask, _mm512_maskz_mov_ps(
                              {in0}.v0, _mm512_set1_ps(1.0f)));
                            _mm512_storeu_ps(mask + {le2}, _mm512_maskz_mov_ps(
                              {in0}.v1, _mm512_set1_ps(1.0f)));'''. \
                            format(le2=le2, **fmtspec)
        return '''int i;
                  f32 mask[{le}], buf[{le}];
                  i16 offset_buf[{le}];
                  {store_mask}
                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in2});
                  {pre}storeu_ps(buf, {in3}.v0);
                  {pre}storeu_ps(buf + {le2}, {in3}.v1);
                  for (i = 0; i < {le}; i++) {{
                    if (nsimd_scalar_reinterpret_u32_f32(mask[i]) != (u32)0) {{
                      {in1}[offset_buf[i]] = nsimd_f32_to_f16(buf[i]);
                    }}
                  }}'''.format(le2=le2, store_mask=store_mask, **fmtspec)
    if simd_ext in (sse + avx) or typ in ['i8', 'u8', 'i16', 'u16']:
        cast = castsi(simd_ext, typ)
        if simd_ext in avx512:
            mask_decl = 'u64 mask;'
            store_mask = 'mask = (u64){in0};'.format(**fmtspec)
            cond = '(mask >> i) & 1'
        else:
            mask_decl = '{typ} mask[{le}];'.format(**fmtspec)
            store_mask = '{pre}storeu{sufsi}({cast}mask, {in0});'. \
                         format(cast=cast, **fmtspec)
            cond = 'nsimd_scalar_reinterpret_{utyp}_{typ}(mask[i]) != '\
                   '({utyp})0'.format(utyp='u' + typ[1:], **fmtspec)
        return '''int i;
                  {typ} buf[{le}];
                  {mask_decl}
                  {ityp} offset_buf[{le}];
                  {store_mask}
                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in2});
                  {pre}storeu{sufsi}({cast}buf, {in3});
                  for (i = 0; i < {le}; i++) {{
                    if ({cond}) {{
                      {in1}[offset_buf[i]] = buf[i];
                    }}
                  }}'''.format(ityp='i' + typ[1:], cast=cast, cond=cond,
                               mask_decl=mask_decl, store_mask=store_mask,
                               **fmtspec)
    # getting here means 32 and 64-bits types for avx512
    return '''{pre}mask_i{typnbits}scatter{suf}(
                  (void *){in1}, {in0}, {in2}, {in3}, {scale});'''. \
                  format(scale=int(typ[1:]) // 8, **fmtspec)

# -----------------------------------------------------------------------------
# gather

def gather(simd_ext, typ):
    # Indexed gather; native gather intrinsics exist only for 32/64-bit
    # types on AVX2/AVX-512 (note the argument-order difference between
    # the two intrinsic families).
    if typ == 'f16':
        return '''nsimd_{simd_ext}_vf16 ret;
                  int i;
                  f32 buf[{le}];
                  i16 offset_buf[{le}];
                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in1});
                  for (i = 0; i < {le}; i++) {{
                    buf[i] = nsimd_f16_to_f32({in0}[offset_buf[i]]);
                  }}
                  ret.v0 = {pre}loadu_ps(buf);
                  ret.v1 = {pre}loadu_ps(buf + {leo2});
                  return ret;'''.format(leo2=int(fmtspec['le']) // 2,
                                        **fmtspec)
    if simd_ext in (sse + ['avx']) or typ in ['i8', 'u8', 'i16', 'u16']:
        cast = castsi(simd_ext, typ)
        return '''int i;
                  {typ} buf[{le}];
                  {ityp} offset_buf[{le}];
                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in1});
                  for (i = 0; i < {le}; i++) {{
                    buf[i] = {in0}[offset_buf[i]];
                  }}
                  return {pre}loadu{sufsi}({cast}buf);'''. \
                  format(ityp='i' + typ[1:], cast=cast, **fmtspec)
    # getting here means 32 and 64-bits types for avx2 and avx512
    if simd_ext == 'avx2':
        if typ in ['i64', 'u64']:
            cast = '(nsimd_longlong *)'
        elif typ in ['i32', 'u32']:
            cast = '(int *)'
        else:
            cast = '({typ} *)'.format(**fmtspec)
        return '''return {pre}i{typnbits}gather{suf}(
                      {cast}{in0}, {in1}, {scale});'''. \
                      format(scale=int(typ[1:]) // 8, cast=cast, **fmtspec)
    elif simd_ext in avx512:
        return 'return {pre}i{typnbits}gather{suf}({in1}, ' \
               '(const void *){in0}, {scale});'. \
               format(scale=int(typ[1:]) // 8, **fmtspec)

# -----------------------------------------------------------------------------
# linear gather

def gather_linear(simd_ext, typ):
    # Gather with constant stride in1; mirrors scatter_linear's strategy.
    le = int(fmtspec['le'])
    cast = castsi(simd_ext, typ)
    if typ == 'f16':
        return '''nsimd_{simd_ext}_vf16 ret;
                  f32 buf[{le}];
                  int i;
                  for (i = 0; i < {le}; i++) {{
                    buf[i] = nsimd_f16_to_f32({in0}[i * {in1}]);
                  }}
                  ret.v0 = {pre}loadu_ps(buf);
                  ret.v1 = {pre}loadu_ps(buf + {leo2});
                  return ret;'''.format(leo2=le // 2, **fmtspec)
    emulation = '''{typ} buf[{le}];
                   int i;
                   for (i = 0; i < {le}; i++) {{
                     buf[i] = {in0}[i * {in1}];
                   }}
                   return {pre}loadu{sufsi}({cast}buf);'''. \
                   format(cast=cast, **fmtspec)
    if simd_ext == 'sse2' and typ not in ['i16', 'u16']:
        return emulation
    if simd_ext in sse + avx:
        trick = \
        '''nsimd_{simd_ext}_v{typ} ret;
           ret = {pre}undefined{sufsi}();
           '''.format(**fmtspec) + ''.join([
               set_lane(simd_ext, typ, 'ret', '{in0}[{i} * {in1}]'. \
                        format(i=i, **fmtspec), i) + '\n' \
                        for i in range(le)]) + \
        '''return ret;'''
        return '''#if NSIMD_WORD_SIZE == 32
                  {}
                  #else
                  {}
                  #endif
                  '''.format(emulation, trick)
    # getting here means AVX-512
    # NOTE(review): lo/hi are overwritten immediately below, so the
    # _mm256_undefined initializers are dead stores in the emitted C.
    return \
    '''nsimd_avx2_v{typ} lo = _mm256_undefined{sufsi2}();
       nsimd_avx2_v{typ} hi = _mm256_undefined{sufsi2}();
       lo = nsimd_gather_linear_avx2_{typ}({in0}, {in1});
       hi = nsimd_gather_linear_avx2_{typ}({in0} + ({leo2} * {in1}), {in1});
       return {merge};'''.format(merge=setr(simd_ext, typ, 'lo', 'hi'),
                                 sufsi2=suf_si('avx2', typ), leo2=le // 2,
                                 **fmtspec)

# -----------------------------------------------------------------------------
# maksed gather

def maskoz_gather(oz, simd_ext, typ):
    # Masked gather; oz == 'z' zeroes inactive lanes, oz == 'o' takes them
    # from in3. Native masked gathers are used for 32/64-bit types on
    # AVX2/AVX-512.
    if typ == 'f16':
        le2 = fmtspec['le'] // 2
        if simd_ext in sse + avx:
            store_mask = '''{pre}storeu_ps(mask, {in0}.v0);
                            {pre}storeu_ps(mask + {le2}, {in0}.v1);'''. \
                            format(le2=le2, **fmtspec)
        else:
            store_mask = '''_mm512_storeu_ps(mask, _mm512_maskz_mov_ps(
                              {in0}.v0, _mm512_set1_ps(1.0f)));
                            _mm512_storeu_ps(mask + {le2}, _mm512_maskz_mov_ps(
                              {in0}.v1, _mm512_set1_ps(1.0f)));'''. \
                            format(le2=le2, **fmtspec)
        if oz == 'z':
            store_oz = '''{pre}storeu_ps(buf, {pre}setzero_ps());
                          {pre}storeu_ps(buf + {le2}, {pre}setzero_ps());'''. \
                          format(le2=le2, **fmtspec)
        else:
            store_oz = '''{pre}storeu_ps(buf, {in3}.v0);
                          {pre}storeu_ps(buf + {le2}, {in3}.v1);'''. \
                          format(le2=le2, **fmtspec)
        return '''nsimd_{simd_ext}_vf16 ret;
                  int i;
                  f32 buf[{le}], mask[{le}];
                  i16 offset_buf[{le}];
                  {store_mask}
                  {store_oz}
                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in2});
                  for (i = 0; i < {le}; i++) {{
                    if (nsimd_scalar_reinterpret_u32_f32(mask[i]) != (u32)0) {{
                      buf[i] = nsimd_f16_to_f32({in1}[offset_buf[i]]);
                    }}
                  }}
                  ret.v0 = {pre}loadu_ps(buf);
                  ret.v1 = {pre}loadu_ps(buf + {leo2});
                  return ret;'''.format(leo2=le2, store_mask=store_mask,
                                        store_oz=store_oz, **fmtspec)
    if simd_ext in (sse + ['avx']) or typ in ['i8', 'u8', 'i16', 'u16']:
        cast = castsi(simd_ext, typ)
        if simd_ext in sse + avx:
            mask_decl = '{typ} mask[{le}];'.format(**fmtspec)
            store_mask = '{pre}storeu{sufsi}({cast}mask, {in0});'. \
                         format(cast=cast, **fmtspec)
            if typ in common.iutypes:
                comp = 'mask[i]'
            else:
                comp = 'nsimd_scalar_reinterpret_u{typnbits}_{typ}(mask[i])'. \
                       format(**fmtspec)
        else:
            mask_decl = 'u64 mask;'
            store_mask = 'mask = (u64){in0};'.format(**fmtspec)
            comp = '(mask >> i) & 1'
        if oz == 'z':
            store_oz = '''{pre}storeu{sufsi}({cast}buf,
                                             {pre}setzero{sufsi}());'''. \
                          format(cast=cast, **fmtspec)
        else:
            store_oz = '{pre}storeu{sufsi}({cast}buf, {in3});'. \
                       format(cast=cast, **fmtspec)
        return '''int i;
                  {typ} buf[{le}];
                  {mask_decl}
                  {ityp} offset_buf[{le}];
                  {store_mask}
                  {store_oz}
                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in2});
                  for (i = 0; i < {le}; i++) {{
                    if ({comp}) {{
                      buf[i] = {in1}[offset_buf[i]];
                    }}
                  }}
                  return {pre}loadu{sufsi}({cast}buf);'''. \
                  format(ityp='i' + typ[1:], cast=cast, store_mask=store_mask,
                         store_oz=store_oz, comp=comp, mask_decl=mask_decl,
                         **fmtspec)
    # getting here means 32 and 64-bits types for avx2 and avx512
    if oz == 'o':
        src = '{in3}'.format(**fmtspec)
    else:
        src = '{pre}setzero{sufsi}()'.format(**fmtspec)
    if simd_ext == 'avx2':
        if typ in ['i64', 'u64']:
            cast = '(nsimd_longlong *)'
        elif typ in ['i32', 'u32']:
            cast = '(int *)'
        else:
            cast = '({typ} *)'.format(**fmtspec)
        return '''return {pre}mask_i{typnbits}gather{suf}({src},
                             {cast}{in1}, {in2}, {in0}, {scale});'''. \
                  format(scale=int(typ[1:]) // 8, cast=cast, src=src,
                         **fmtspec)
    elif simd_ext in avx512:
        return 'return {pre}mask_i{typnbits}gather{suf}({src}, {in0}, ' \
               '{in2}, (const void *){in1}, {scale});'. \
               format(src=src, scale=int(typ[1:]) // 8, **fmtspec)

# -----------------------------------------------------------------------------
# get_impl function

def get_impl(opts, func, simd_ext, from_typ, to_typ):
    # Entry point of this backend: fills the module-level fmtspec with the
    # substitution values every generator above reads, then dispatches on
    # the operator name. fmtspec MUST be set before any impls lambda runs.
    global fmtspec

    fmtspec = {
      'simd_ext': simd_ext,
      'typ': from_typ,
      'styp': get_native_typ(simd_ext, from_typ),
      'from_typ': from_typ,
      'to_typ': to_typ,
      'pre': pre(simd_ext),
      'suf': suf_ep(from_typ),
      'sufsi': suf_si(simd_ext, from_typ),
      'in0': common.in0,
      'in1': common.in1,
      'in2': common.in2,
      'in3': common.in3,
      'in4': common.in4,
      'in5': common.in5,
      'nbits': nbits(simd_ext),
      'le': int(nbits(simd_ext)) // int(from_typ[1:]),
      'typnbits': from_typ[1:]
    }

    impls = {
        'loada': lambda: load(simd_ext, from_typ, True),
        'masko_loada1': lambda: maskoz_load(simd_ext, from_typ, 'o', True),
        'maskz_loada1': lambda: maskoz_load(simd_ext, from_typ, 'z', True),
        'load2a': lambda: load_deg234(simd_ext, from_typ, True, 2),
        'load3a': lambda: load_deg234(simd_ext, from_typ, True, 3),
        'load4a': lambda: load_deg234(simd_ext, from_typ, True, 4),
        'loadu': lambda: load(simd_ext, from_typ, False),
        'masko_loadu1': lambda: maskoz_load(simd_ext, from_typ, 'o', False),
        'maskz_loadu1': lambda: maskoz_load(simd_ext, from_typ, 'z', False),
        'load2u': lambda: load_deg234(simd_ext, from_typ, False, 2),
        'load3u': lambda: load_deg234(simd_ext, from_typ, False, 3),
        'load4u': lambda: load_deg234(simd_ext, from_typ, False, 4),
        'storea': lambda: store(simd_ext, from_typ, True),
        'mask_storea1': lambda: mask_store(simd_ext, from_typ, True),
        'store2a': lambda: store_deg234(simd_ext, from_typ, True, 2),
        'store3a': lambda: store_deg234(simd_ext, from_typ, True, 3),
        'store4a': lambda: store_deg234(simd_ext, from_typ, True, 4),
        'storeu': lambda: store(simd_ext, from_typ, False),
        'mask_storeu1': lambda: mask_store(simd_ext, from_typ, False),
        'store2u': lambda: store_deg234(simd_ext, from_typ, False, 2),
        'store3u': lambda: store_deg234(simd_ext, from_typ, False, 3),
        'store4u': lambda: store_deg234(simd_ext, from_typ, False, 4),
        'gather': lambda: gather(simd_ext, from_typ),
        'gather_linear': lambda: gather_linear(simd_ext, from_typ),
        'masko_gather': lambda: maskoz_gather('o', simd_ext, from_typ),
        'maskz_gather': lambda: maskoz_gather('z', simd_ext, from_typ),
        'scatter': lambda: scatter(simd_ext, from_typ),
        'scatter_linear': lambda: scatter_linear(simd_ext, from_typ),
        'mask_scatter': lambda: mask_scatter(simd_ext, from_typ),
        'andb': lambda: binop2('andb', simd_ext, from_typ),
        'xorb': lambda: binop2('xorb', simd_ext, from_typ),
        'orb': lambda: binop2('orb', simd_ext, from_typ),
        'andl': lambda: binlop2('andl', simd_ext, from_typ),
        'xorl': lambda: binlop2('xorl', simd_ext, from_typ),
        'orl': lambda: binlop2('orl', simd_ext, from_typ),
        'notb': lambda: not1(simd_ext, from_typ),
        'notl': lambda: lnot1(simd_ext, from_typ),
        'andnotb': lambda: andnot2(simd_ext, from_typ),
        'andnotl': lambda: landnot2(simd_ext, from_typ),
        'add': lambda: addsub('add', simd_ext, from_typ),
        'sub': lambda: addsub('sub', simd_ext, from_typ),
        'adds': lambda: adds(simd_ext, from_typ),
        'subs': lambda: subs(simd_ext, from_typ),
        'div': lambda: div2(opts, simd_ext, from_typ),
        'sqrt': lambda: sqrt1(simd_ext, from_typ),
        'len': lambda: len1(simd_ext, from_typ),
        'mul': lambda: mul2(opts, simd_ext, from_typ),
        'shl': lambda: shl_shr('shl', simd_ext, from_typ),
        'shr': lambda: shl_shr('shr', simd_ext, from_typ),
        'shra': lambda: shra(opts, simd_ext, from_typ),
        'set1': lambda: set1(simd_ext, from_typ),
        'set1l': lambda: set1l(simd_ext, from_typ),
        'eq': lambda: eq2(simd_ext, from_typ),
        'ne': lambda: neq2(simd_ext, from_typ),
        'gt': lambda: gt2(simd_ext, from_typ),
        'lt': lambda: lt2(simd_ext, from_typ),
        'ge': lambda: geq2(simd_ext, from_typ),
        'le': lambda: leq2(simd_ext, from_typ),
        'if_else1': lambda: if_else1(simd_ext, from_typ),
        'min': lambda: minmax('min', simd_ext, from_typ),
        'max': lambda: minmax('max', simd_ext, from_typ),
        'loadla': lambda: loadl(simd_ext, from_typ, True),
        'loadlu': lambda: loadl(simd_ext, from_typ, False),
        'storela': lambda: storel(simd_ext, from_typ, True),
        'storelu': lambda: storel(simd_ext, from_typ, False),
        'abs': lambda: abs1(simd_ext, from_typ),
        'fma': lambda: fma_fms('fma', simd_ext, from_typ),
        'fnma': lambda: fma_fms('fnma', simd_ext, from_typ),
        'fms': lambda: fma_fms('fms', simd_ext, from_typ),
        'fnms': lambda: fma_fms('fnms', simd_ext, from_typ),
        'ceil': lambda: round1(opts, 'ceil', simd_ext, from_typ),
        'floor': lambda: round1(opts, 'floor', simd_ext, from_typ),
        'trunc': lambda: trunc1(opts, simd_ext, from_typ),
        'round_to_even': lambda: round_to_even1(opts, simd_ext, from_typ),
        'all': lambda: all_any('all', simd_ext, from_typ),
        'any': lambda: all_any('any', simd_ext, from_typ),
        'reinterpret': lambda: reinterpret1(simd_ext, from_typ, to_typ),
        'reinterpretl': lambda: reinterpretl1(simd_ext, from_typ, to_typ),
        'cvt': lambda: convert1(simd_ext, from_typ, to_typ),
        'rec11': lambda: rec11_rsqrt11('rcp', simd_ext, from_typ),
        'rec8': lambda: rec11_rsqrt11('rcp', simd_ext, from_typ),
        'rsqrt11': lambda: rec11_rsqrt11('rsqrt', simd_ext, from_typ),
        'rsqrt8': lambda: rec11_rsqrt11('rsqrt', simd_ext, from_typ),
        'rec': lambda: rec1(simd_ext, from_typ),
        'neg': lambda: neg1(simd_ext, from_typ),
        'nbtrue': lambda: nbtrue1(simd_ext, from_typ),
        'reverse': lambda: reverse1(simd_ext, from_typ),
        'addv': lambda: addv(simd_ext, from_typ),
        'upcvt': lambda: upcvt1(simd_ext, from_typ, to_typ),
        'downcvt': lambda: downcvt1(opts, simd_ext, from_typ, to_typ),
        'to_mask': lambda: to_mask1(simd_ext, from_typ),
        'to_logical': lambda: to_logical1(simd_ext, from_typ),
        'ziplo': lambda: zip_half('ziplo', simd_ext, from_typ),
        'ziphi': lambda: zip_half('ziphi', simd_ext, from_typ),
        'unziplo': lambda: unzip_half(opts, 'unziplo', simd_ext, from_typ),
        'unziphi': lambda: unzip_half(opts, 'unziphi', simd_ext, from_typ),
        'zip' : lambda : zip(simd_ext, from_typ),
        'unzip' : lambda : unzip(simd_ext, from_typ),
        'mask_for_loop_tail': lambda : mask_for_loop_tail(simd_ext, from_typ),
        'iota': lambda : iota(simd_ext, from_typ)
    }
    if simd_ext not in get_simd_exts():
        raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext))
    if not from_typ in common.types:
        raise ValueError('Unknown type "{}"'.format(from_typ))
    if not func in impls:
        return common.NOT_IMPLEMENTED
    else:
        return impls[func]()



================================================
FILE: egg/rocm.py
================================================
# Copyright (c) 2020 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import cuda

# -----------------------------------------------------------------------------

def get_impl(operator, totyp, typ):
    # ROCm scalar implementations are identical to the CUDA ones: delegate.
    return cuda.get_impl(operator, totyp, typ)



================================================
FILE: egg/scalar.py
================================================
# Copyright (c) 2020 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import common

# Substitution values shared by all generators below; filled by get_impl().
fmtspec = dict()

# -----------------------------------------------------------------------------

def opnum(func, typ):
    # Numeric operator: cast the result back to typ; f16 goes through f32
    # conversions when native ARM FP16 arithmetic is unavailable.
    normal = 'return ({typ})({func});'. \
             format(func=func.format(**fmtspec), **fmtspec)
    if typ == 'f16':
        return \
        '''#ifdef NSIMD_ARM_FP16
             {normal}
           #else
             return nsimd_f32_to_f16({func});
           #endif'''.format(normal=normal, func=func. \
           format(in0='nsimd_f16_to_f32({in0})',
                  in1='nsimd_f16_to_f32({in1})',
                  in2='nsimd_f16_to_f32({in2})').format(**fmtspec))
    else:
        return normal

# -----------------------------------------------------------------------------

def cmp(func, typ):
    # Comparison operator: no cast on the result (it is a C boolean-ish int);
    # f16 compares the f32-converted operands when native FP16 is unavailable.
    normal = 'return ({func});'. \
             format(func=func.format(**fmtspec), **fmtspec)
    if typ == 'f16':
        return \
        '''#ifdef NSIMD_ARM_FP16
             {normal}
           #else
             return ({func});
           #endif'''.format(normal=normal, func=func. \
           format(in0='nsimd_f16_to_f32({in0})',
                  in1='nsimd_f16_to_f32({in1})',
                  in2='nsimd_f16_to_f32({in2})').format(**fmtspec))
    else:
        return normal

# -----------------------------------------------------------------------------

def opbit(func, typ):
    # Bitwise operator: work on the unsigned reinterpretation of the operands
    # and reinterpret the result back to typ for non-unsigned types.
    in0 = '{in0}'.format(**fmtspec) if typ in common.utypes else \
          'nsimd_scalar_reinterpret_u{typnbits}_{typ}({in0})'.format(**fmtspec)
    in1 = '{in1}'.format(**fmtspec) if typ in common.utypes else \
          'nsimd_scalar_reinterpret_u{typnbits}_{typ}({in1})'.format(**fmtspec)
    if typ in common.utypes:
        return 'return ({typ})({func});'. \
               format(func=func.format(in0=in0, in1=in1), **fmtspec)
    else:
        return '''return nsimd_scalar_reinterpret_{typ}_u{typnbits}(
                      (u{typnbits})({func}));'''.format(
                      func=func.format(in0=in0, in1=in1), **fmtspec)

# -----------------------------------------------------------------------------

def shift(func, typ):
    # Shifts: logical shifts on signed types go through the unsigned
    # reinterpretation to avoid implementation-defined signed >>; shra
    # re-creates the sign-extension mask by hand.
    if func == 'shl':
        return 'return ({typ})({in0} << {in1});'.format(**fmtspec)
    # getting here means shr or shra
    if typ in common.utypes:
        return 'return ({typ})({in0} >> {in1});'.format(**fmtspec)
    # getting here means shr or shra on signed type
    utyp = common.bitfield_type[typ]
    if func == 'shr':
        return '''return nsimd_scalar_reinterpret_{typ}_{utyp}(
                      ({utyp})(nsimd_scalar_reinterpret_{utyp}_{typ}(
                          {in0}) >> {in1}));'''.format(utyp=utyp, **fmtspec)
    # getting here means shra on signed type
    return \
    '''if ({in1} == 0) {{
         return {in0};
       }}
       if ({in0} >= 0) {{
         return nsimd_scalar_reinterpret_{typ}_{utyp}(({utyp})(
                    nsimd_scalar_reinterpret_{utyp}_{typ}({in0}) >> {in1}));
       }} else {{
         {utyp} mask = ({utyp})((({utyp})-1) << ({typnbits} - {in1}));
         return nsimd_scalar_reinterpret_{typ}_{utyp}(({utyp})(mask |
                    ({utyp})(nsimd_scalar_reinterpret_{utyp}_{typ}(
                        {in0}) >> {in1})));
       }}'''.format(utyp=utyp, **fmtspec)

# -----------------------------------------------------------------------------

def libm_opn(func, arity, typ, until_cpp11, c89_code):
    # Emits a call to a libm function, choosing between the C99/POSIX entry
    # point and a C89 fallback at C preprocessing time; f16/f32 variants add
    # the necessary conversions. c89_code == '' means "derive the fallback
    # from the f64 libm function".
    cxx_version = '> 0' if not until_cpp11 else '>= 2011'
    comment = \
    '''/* {func} is not available in C89 but is given by POSIX 2001 */
       /* and C99. But we do not want to pollute the user includes */
       /* and POSIX value if set so we play dirty. */'''. \
       format(func=func)
    args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \
                      for i in range(arity)])
    args_f16 = ', '.join(['nsimd_f16_to_f32({{in{}}})'.format(i). \
                          format(**fmtspec) for i in range(arity)])
    args_f64 = ', '.join(['(f64){{in{}}}'.format(i).format(**fmtspec) \
                          for i in range(arity)])
    args_f64_f16 = ', '.join(['(f64)nsimd_f16_to_f32({{in{}}})'.format(i). \
                              format(**fmtspec) for i in range(arity)])
    if typ == 'f16':
        c99_code = 'return nsimd_f32_to_f16({}f({}));'.format(func, args_f16)
        if c89_code == '':
            c89_code = 'return nsimd_f32_to_f16((f32){}({}));'. \
                       format(func, args_f64_f16)
        return \
        '''
           {comment}
           #if defined(NSIMD_IS_MSVC) && _MSC_VER <= 1800 /* VS 2012 */
             {c89_code}
           #else
             #if NSIMD_CXX {cxx_version} || NSIMD_C >= 1999 || \
                 _POSIX_C_SOURCE >= 200112L
               {c99_code}
             #else
               {c89_code}
             #endif
           #endif'''. \
           format(comment=comment, cxx_version=cxx_version,
                  c89_code=c89_code, c99_code=c99_code)
    elif typ == 'f32':
        c99_code = 'return {}f({});'.format(func, args)
        if c89_code == '':
            c89_code = 'return (f32){}({});'.format(func, args_f64)
        return \
        '''
           {comment}
           #if defined(NSIMD_IS_MSVC) && _MSC_VER <= 1800 /* VS 2012 */
             {c89_code}
           #else
             #if NSIMD_CXX {cxx_version} || NSIMD_C >= 1999 || \
                 _POSIX_C_SOURCE >= 200112L
               {c99_code}
             #else
               {c89_code}
             #endif
           #endif'''. \
           format(comment=comment, cxx_version=cxx_version,
                  c89_code=c89_code, c99_code=c99_code)
    else:
        normal = 'return {}({});'.format(func, args)
        if c89_code == '':
            return normal
        return \
        '''
           {comment}
           #if NSIMD_CXX {cxx_version} || NSIMD_C >= 1999 || \
               _POSIX_C_SOURCE >= 200112L
             {normal}
           #else
             {c89_code}
           #endif'''. \
           format(comment=comment, normal=normal, c89_code=c89_code,
                  cxx_version=cxx_version)

# -----------------------------------------------------------------------------

def round_to_even(typ):
    # Banker's rounding built from floor/ceil: compare the two distances and
    # break ties towards the even integer (fl is even iff fl/2 is integral).
    if typ in ['f32', 'f64']:
        return \
        '''{typ} fl = nsimd_scalar_floor_{typ}({in0});
           {typ} ce = nsimd_scalar_ceil_{typ}({in0});
           {typ} df = {in0} - fl; /* exactly representable in IEEE754 */
           {typ} dc = ce - {in0}; /* exactly representable in IEEE754 */
           if (df < dc) {{
             return fl;
           }} else if (df > dc) {{
             return ce;
           }} else {{
             {typ} fld2 = fl * 0.5{f}; /* exactly representable in IEEE754 */
             if (fld2 == nsimd_scalar_floor_{typ}(fld2)) {{
               return fl;
             }} else {{
               return ce;
             }}
           }}'''.format(f='f' if typ == 'f32' else '', **fmtspec)
    elif typ == 'f16':
        return \
        '''f32 in0 = nsimd_f16_to_f32({in0});
           f32 fl = nsimd_scalar_floor_f32(in0);
           f32 ce = nsimd_scalar_ceil_f32(in0);
           f32 df = in0 - fl; /* exactly representable in IEEE754 */
           f32 dc = ce - in0; /* exactly representable in IEEE754 */
           if (df < dc) {{
             return nsimd_f32_to_f16(fl);
           }} else if (df > dc) {{
             return nsimd_f32_to_f16(ce);
           }} else {{
             f32 fld2 = fl * 0.5f; /* exactly representable in IEEE754 */
             if (fld2 == nsimd_scalar_floor_f32(fld2)) {{
               return nsimd_f32_to_f16(fl);
             }} else {{
               return nsimd_f32_to_f16(ce);
             }}
           }}'''.format(**fmtspec)
    else:
        # integers are already "rounded"
        return 'return {in0};'.format(**fmtspec)

# -----------------------------------------------------------------------------

def reinterpret(totyp, typ):
    # Bit-preserving reinterpretation. GCC guarantees the union trick;
    # elsewhere memcpy is used; the emulated f16 type stores its bits in .u.
    if totyp == typ:
        return 'return {in0};'.format(**fmtspec)
    via_union = '''union {{ {typ} from; {totyp} to; }} buf;
                   buf.from = {in0};
                   return buf.to;'''.format(**fmtspec)
    via_memcpy = '''{totyp} ret;
                    memcpy((void *)&ret, (void *)&{in0}, sizeof(ret));
                    return ret;'''.format(**fmtspec)
    if typ == 'f16':
        if totyp == 'u16':
            emulated = 'return {in0}.u;'.format(**fmtspec)
        else:
            emulated = 'return nsimd_scalar_reinterpret_i16_u16({in0}.u);'. \
                       format(**fmtspec)
        return \
        '''#if defined(NSIMD_ARM_FP16) && defined(NSIMD_IS_GCC)
             {via_union}
           #elif (defined(NSIMD_ARM_FP16) && !defined(NSIMD_IS_GCC)) || \
                 defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || \
                 defined(NSIMD_ONEAPI)
             {via_memcpy}
           #else
             {emulated}
           #endif'''.format(via_union=via_union, via_memcpy=via_memcpy,
                            emulated=emulated)
    if totyp == 'f16':
        if typ == 'u16':
            emulated = '''f16 ret;
                          ret.u = {in0};
                          return ret;'''.format(**fmtspec)
        else:
            emulated = '''f16 ret;
                          ret.u = nsimd_scalar_reinterpret_u16_i16({in0});
                          return ret;'''.format(**fmtspec)
        return \
        '''#if defined(NSIMD_ARM_FP16) && defined(NSIMD_IS_GCC)
             {via_union}
           #elif (defined(NSIMD_ARM_FP16) && !defined(NSIMD_IS_GCC)) || \
                 defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || \
                 defined(NSIMD_ONEAPI)
             {via_memcpy}
           #else
             {emulated}
           #endif'''.format(via_union=via_union, via_memcpy=via_memcpy,
                            emulated=emulated)
    return '''#ifdef NSIMD_IS_GCC
                {via_union}
              #else
                {via_memcpy}
              #endif'''.format(via_union=via_union, via_memcpy=via_memcpy)

# -----------------------------------------------------------------------------

def cvt(totyp, typ):
    # Value-preserving conversion; f16 goes through f32 when no native FP16.
    if totyp == typ:
        return 'return {in0};'.format(**fmtspec)
    if typ == 'f16':
        return '''#ifdef NSIMD_ARM_FP16
                    return ({totyp}){in0};
                  #else
                    return ({totyp})nsimd_f16_to_f32({in0});
                  #endif'''.format(**fmtspec)
    if totyp == 'f16':
        return '''#ifdef NSIMD_ARM_FP16
                    return (f16){in0};
                  #else
                    return nsimd_f32_to_f16((f32){in0});
                  #endif'''.format(**fmtspec)
    return 'return ({totyp}){in0};'.format(**fmtspec)

# -----------------------------------------------------------------------------

def adds(typ):
    # Saturated addition: plain add for floats, wraparound check for
    # unsigned, explicit clamping to NSIMD_*_MIN/MAX for signed.
    if typ in common.ftypes:
        return opnum('{in0} + {in1}', typ)
    if typ in common.utypes:
        return '''{typ} tmp = ({typ})({in0} + {in1});
                  if (tmp < {in0} || tmp < {in1}) {{
                    return ({typ})-1;
                  }} else {{
                    return tmp;
                  }}
                  '''.format(**fmtspec)
    # Getting here means typ is signed
    int_max = 'NSIMD_' + typ.upper() + '_MAX'
    int_min = 'NSIMD_' + typ.upper() + '_MIN'
    return '''if (({in0} >= 0 && {in1} <= 0) || ({in0} <= 0 && {in1} >= 0)) {{
                return ({typ})({in0} + {in1});
              }} else {{
                if ({in0} > 0) {{
                  if ({in1} > {int_max} - {in0}) {{
                    return {int_max};
                  }} else {{
                    return ({typ})({in0} + {in1});
                  }}
                }} else {{
                  if ({in1} < {int_min} - {in0}) {{
                    return {int_min};
                  }} else {{
                    return ({typ})({in0} + {in1});
                  }}
                }}
              }}'''.format(int_min=int_min, int_max=int_max, **fmtspec)

# -----------------------------------------------------------------------------

def subs(typ):
    # Saturated subtraction: clamp to 0 for unsigned, reuse saturated add
    # with a negated second operand for signed.
    if typ in common.ftypes:
        return opnum('{in0} - {in1}', typ)
    if typ in common.utypes:
        return '''if ({in0} < {in1}) {{
                    return ({typ})0;
                  }} else {{
                    return ({typ})({in0} - {in1});
                  }}
                  '''.format(**fmtspec)
    # Getting here means typ is signed
    # NOTE(review): in the emitted C, -{in1} overflows when in1 is the
    # minimum value of the signed type (UB) -- TODO confirm whether callers
    # rely on this case.
    return 'return nsimd_scalar_adds_{typ}({in0}, ({typ})(-{in1}));'. \
           format(**fmtspec)

# -----------------------------------------------------------------------------

def get_impl(operator, totyp, typ):
    # Entry point: fills fmtspec (read by every generator above) and
    # dispatches on the operator; operators with special cases are handled
    # first, the rest through the func dictionary below.
    global fmtspec

    fmtspec = {
      'in0': common.in0,
      'in1': common.in1,
      'in2': common.in2,
      'typ': typ,
      'totyp': totyp,
      'typnbits': typ[1:]
    }

    if operator.name == 'trunc':
        if typ in common.iutypes:
            return 'return {in0};'.format(**fmtspec)
        elif typ == 'f16':
            c89_code = \
            '''f32 buf = nsimd_f16_to_f32({in0});
               return nsimd_f32_to_f16(buf >= 0.0f
                                       ? nsimd_scalar_floor_f32(buf)
                                       : nsimd_scalar_ceil_f32(buf));'''. \
               format(**fmtspec)
        else:
            c89_code = \
            '''return {in0} >= 0.0{f} ? nsimd_scalar_floor_{typ}({in0})
                                      : nsimd_scalar_ceil_{typ}({in0});'''. \
               format(f='f' if typ == 'f32' else '', **fmtspec)
        return libm_opn('trunc', 1, typ, True, c89_code)
    if operator.name == 'abs':
        if typ == 'f16':
            return '''f32 tmp = nsimd_f16_to_f32({in0});
                      return nsimd_f32_to_f16(tmp >= 0.0f ? tmp : -tmp);'''. \
                      format(**fmtspec)
        elif typ in common.utypes:
            return 'return {in0};'.format(**fmtspec)
        else:
            return 'return ({typ})({in0} >= ({typ})0 ? {in0} : -{in0});'. \
                   format(**fmtspec)
    if operator.name in ['min', 'max']:
        op = '<' if operator.name == 'min' else '>'
        if typ == 'f16':
            return '''f32 in0 = nsimd_f16_to_f32({in0});
                      f32 in1 = nsimd_f16_to_f32({in1});
                      return nsimd_f32_to_f16(in0 {op} in1 ? in0 : in1);'''. \
                      format(op=op, **fmtspec)
        else:
            return 'return {in0} {op} {in1} ? {in0} : {in1};'. \
                   format(op=op, **fmtspec)
    if operator.name == 'to_logical':
        if typ in common.iutypes:
            return 'return {in0} != ({typ})0;'.format(**fmtspec)
        else:
            return '''return nsimd_scalar_reinterpret_u{typnbits}_{typ}(
                          {in0}) != (u{typnbits})0;'''.format(**fmtspec)
    if operator.name == 'to_mask':
        if typ in common.utypes:
            return 'return ({typ})({in0} ? -1 : 0);'.format(**fmtspec)
        else:
            return '''return nsimd_scalar_reinterpret_{typ}_u{typnbits}((
                          u{typnbits})({in0} ? -1 : 0));'''. \
                          format(**fmtspec)
    if operator.name == 'round_to_even':
        return round_to_even(typ)
    if operator.name in ['floor', 'ceil', 'sqrt']:
        if typ in common.iutypes and operator.name != 'sqrt':
            return 'return {in0};'.format(**fmtspec)
        return libm_opn(operator.name, 1, typ, False, '')
    if operator.name == 'fma':
        if typ in common.iutypes:
            return 'return ({typ})({in0} * {in1} + {in2});'.format(**fmtspec)
        else:
            if typ == 'f16':
                c89_code = 'return nsimd_f32_to_f16(nsimd_f16_to_f32({in0}) ' \
                           '* nsimd_f16_to_f32({in1}) ' \
                           '+ nsimd_f16_to_f32({in2}));'.format(**fmtspec)
            else:
                c89_code = 'return {in0} * {in1} + {in2};'.format(**fmtspec)
            return libm_opn(operator.name, 3, typ, False, c89_code)
    if operator.name in ['fnma', 'fms', 'fnms']:
        # fnma/fnms negate the product, fms/fnms subtract the addend; both
        # are expressed through the already-generated scalar fma.
        neg = '-' if operator.name in ['fnms', 'fnma'] else ''
        op = '-' if operator.name in ['fms', 'fnms'] else '+'
        if typ in common.iutypes:
            return 'return ({typ})(({neg}{in0}) * {in1} {op} {in2});'. \
                   format(neg=neg, op=op, **fmtspec)
        else:
            typ2 = 'f32' if typ == 'f16' else typ
            return opnum(
                'nsimd_scalar_fma_{typ2}({neg}{{in0}}, {{in1}}, {op}{{in2}})'. \
                format(typ2=typ2, neg=neg, op=op, **fmtspec), typ)
    f = 'f' if typ in ['f16', 'f32'] else ''
    typ2 = 'f32' if typ == 'f16' else typ
    if operator.src:
        # Operators backed by Sleef sources call the Sleef scalar kernels.
        if typ == 'f16':
            return \
            '''return nsimd_f32_to_f16(
                          nsimd_sleef_{op_name}_scalar_f32({vas}));'''. \
                          format(op_name=operator.name,
                                 vas=', '.join(['nsimd_f16_to_f32({})'. \
                                 format(common.get_arg(i)) \
                                 for i in range(len(operator.params[1:]))]),
                                 **fmtspec)
        else:
            return 'return nsimd_sleef_{op_name}_scalar_{typ}({vas});'. \
                   format(op_name=operator.name,
                          vas=common.get_args(len(operator.params[1:])),
                          **fmtspec)
    func = {
        'orb': lambda: opbit('{in0} | {in1}', typ),
        'andb': lambda: opbit('{in0} & {in1}', typ),
        'andnotb': lambda: opbit('{in0} & (~{in1})', typ),
        'notb': lambda: opbit('~{in0}', typ),
        'xorb': lambda: opbit('{in0} ^ {in1}', typ),
        'add': lambda: opnum('{in0} + {in1}', typ),
        'sub': lambda: opnum('{in0} - {in1}', typ),
        'mul': lambda: opnum('{in0} * {in1}', typ),
        'div': lambda: opnum('{in0} / {in1}', typ),
        'neg': lambda: opnum('-{in0}', typ),
        'lt': lambda: cmp('{in0} < {in1}', typ),
        'gt': lambda: cmp('{in0} > {in1}', typ),
        'le': lambda: cmp('{in0} <= {in1}', typ),
        'ge': lambda: cmp('{in0} >= {in1}', typ),
        'ne': lambda: cmp('{in0} != {in1}', typ),
        'eq': lambda: cmp('{in0} == {in1}', typ),
        'andl': lambda: 'return {in0} && {in1};'.format(**fmtspec),
        'orl': lambda: 'return {in0} || {in1};'.format(**fmtspec),
        'xorl': lambda: 'return {in0} ^ {in1};'.format(**fmtspec),
        'andnotl': lambda: 'return {in0} && (!{in1});'.format(**fmtspec),
        'notl': lambda: 'return !{in0};'.format(**fmtspec),
        'shl': lambda: shift('shl', typ),
        'shr': lambda: shift('shr', typ),
        'shra': lambda: shift('shra', typ),
        'reinterpret': lambda: reinterpret(totyp, typ),
        'cvt': lambda: cvt(totyp, typ),
        'adds': lambda: adds(typ),
        'subs': lambda: subs(typ),
        'rec': lambda: opnum('1.0{f} / {{in0}}'.format(f=f), typ),
        'rec8': lambda: opnum('1.0{f} / {{in0}}'.format(f=f), typ),
        'rec11': lambda: opnum('1.0{f} / {{in0}}'.format(f=f), typ),
        'rsqrt': lambda:
opnum('1.0{f} / nsimd_scalar_sqrt_{typ2}({{in0}})'. \ format(f=f, typ2=typ2), typ), 'rsqrt8': lambda: opnum('1.0{f} / nsimd_scalar_sqrt_{typ2}({{in0}})'. \ format(f=f, typ2=typ2), typ), 'rsqrt11': lambda: opnum('1.0{f} / nsimd_scalar_sqrt_{typ2}({{in0}})'. \ format(f=f, typ2=typ2), typ) } return func[operator.name]() ================================================ FILE: egg/x86_load_store_deg234.py ================================================ # Copyright (c) 2019 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import platform_x86 as x86
import common

# SIMD extension families handled by this generator.
sse = ['sse2', 'sse42']
avx = ['avx', 'avx2']
avx512 = ['avx512_knl', 'avx512_skylake']

###############################################################################
# Helper

def perm64(var1, var2, ind1, ind2):
    # Emit C code selecting one 64-bit half from each of two __m128i values
    # (via a pd shuffle), i.e. a 64-bit-lane permute of var1/var2.
    return '''_mm_castpd_si128(_mm_shuffle_pd(
                  _mm_castsi128_pd({}), _mm_castsi128_pd(
                      {}), _MM_SHUFFLE2({}, {})))'''. \
                      format(var1, var2, ind1, ind2)

###############################################################################

def get_load_v0v1(simd_ext, typ, align, fmtspec):
    # Emit C code loading two consecutive full vectors from a0 into v0/v1
    # (aligned or unaligned depending on `align`). Integer types need a
    # cast of a0 to the native vector pointer type.
    load = '{pre}load{a}{sufsi}'.format(a='' if align else 'u', **fmtspec)
    if typ in ['f32', 'f64']:
        return '''{styp} v0 = {load}(a0);
                  {styp} v1 = {load}(a0 + {le});'''. \
                  format(load=load, **fmtspec)
    else:
        return '''{styp} v0 = {load}(({styp}*)a0);
                  {styp} v1 = {load}(({styp}*)a0 + 1);'''. \
                  format(load=load, **fmtspec)

###############################################################################

def load2_sse(simd_ext, typ, align, fmtspec2):
    # Emit the SSE body of load2: load two interleaved vectors and split
    # them into ret.v0/ret.v1. SSE4.2 can use pshufb (_mm_shuffle_epi8);
    # plain SSE2 falls back to cascaded unpacks.
    fmtspec = fmtspec2.copy()
    fmtspec['load_v0v1'] = get_load_v0v1('sse', typ, align, fmtspec)
    if typ in ['i8', 'u8']:
        if simd_ext == 'sse42':
            return \
            '''nsimd_sse42_v{typ}x2 ret;
               {load_v0v1}
               __m128i mask = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1,
                                           14, 12, 10, 8, 6, 4, 2, 0);
               __m128i A0 = _mm_shuffle_epi8(v0, mask);
               __m128i B0 = _mm_shuffle_epi8(v1, mask);
               ret.v0 = {perm0};
               ret.v1 = {perm1};
               return ret;'''. \
               format(perm0=perm64('A0', 'B0', '0', '0'),
                      perm1=perm64('A0', 'B0', '1', '1'), **fmtspec)
        else:
            return \
            '''nsimd_sse2_v{typ}x2 ret;
               {load_v0v1}
               __m128i A1 = _mm_unpacklo_epi8(v0, v1);
               __m128i B2 = _mm_unpackhi_epi8(v0, v1);
               __m128i A3 = _mm_unpacklo_epi8(A1, B2);
               __m128i B4 = _mm_unpackhi_epi8(A1, B2);
               __m128i A5 = _mm_unpacklo_epi8(A3, B4);
               __m128i B6 = _mm_unpackhi_epi8(A3, B4);
               ret.v0 = _mm_unpacklo_epi8(A5, B6);
               ret.v1 = _mm_unpackhi_epi8(A5, B6);
               return ret;'''.format(**fmtspec)
    if typ in ['i16', 'u16']:
        if simd_ext == 'sse42':
            return \
            '''nsimd_sse42_v{typ}x2 ret;
               {load_v0v1}
               __m128i mask = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2,
                                           13, 12, 9, 8, 5, 4, 1, 0);
               __m128i A0 = _mm_shuffle_epi8(v0, mask);
               __m128i B0 = _mm_shuffle_epi8(v1, mask);
               ret.v0 = {perm0};
               ret.v1 = {perm1};
               return ret;'''. \
               format(perm0=perm64('A0', 'B0', '0', '0'),
                      perm1=perm64('A0', 'B0', '1', '1'), **fmtspec)
        else:
            return \
            '''nsimd_sse2_v{typ}x2 ret;
               {load_v0v1}
               __m128i v2 = _mm_unpacklo_epi16(v0, v1);
               __m128i v3 = _mm_unpackhi_epi16(v0, v1);
               __m128i v5 = _mm_unpacklo_epi16(v2, v3);
               __m128i v6 = _mm_unpackhi_epi16(v2, v3);
               ret.v0 = _mm_unpacklo_epi16(v5, v6);
               ret.v1 = _mm_unpackhi_epi16(v5, v6);
               return ret;'''.format(**fmtspec)
    if typ in ['i32', 'u32', 'f32']:
        return '''nsimd_{simd_ext}_v{typ}x2 ret;
                  {load_v0v1}
                  {styp} A0 = _mm_unpacklo{suf}(v0, v1);
                  {styp} B0 = _mm_unpackhi{suf}(v0, v1);
                  ret.v0 = _mm_unpacklo{suf}(A0, B0);
                  ret.v1 = _mm_unpackhi{suf}(A0, B0);
                  return ret;'''.format(**fmtspec)
    if typ in ['i64', 'u64', 'f64']:
        return '''nsimd_{simd_ext}_v{typ}x2 ret;
                  {load_v0v1}
                  ret.v0 = _mm_unpacklo{suf}(v0, v1);
                  ret.v1 = _mm_unpackhi{suf}(v0, v1);
                  return ret;'''.format(**fmtspec)

###############################################################################

def load2_avx(simd_ext, typ, align, fmtspec2):
    # Emit the AVX/AVX2 body of load2. AVX (without AVX2 integer ops)
    # works on extracted 128-bit halves; AVX2 uses full-width shuffles.
    fmtspec = fmtspec2.copy()
    fmtspec['exlo_v0'] = x86.extract('avx', typ, x86.LO, 'v0')
    fmtspec['exhi_v0'] = x86.extract('avx', typ, x86.HI, 'v0')
    fmtspec['exlo_v1'] = x86.extract('avx',
                                     typ, x86.LO, 'v1')
    fmtspec['exhi_v1'] = x86.extract('avx', typ, x86.HI, 'v1')
    fmtspec['load_v0v1'] = get_load_v0v1('avx', typ, align, fmtspec)
    # 'a' selects the aligned vs unaligned variant name of recursive calls.
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        if simd_ext == 'avx2':
            return \
            '''nsimd_avx2_v{typ}x2 ret;
               {load_v0v1}
               __m256i mask = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14,
                                  1, 3, 5, 7, 9, 11, 13, 15,
                                  0, 2, 4, 6, 8, 10, 12, 14,
                                  1, 3, 5, 7, 9, 11, 13, 15);
               __m256i A1 = _mm256_shuffle_epi8(v0, mask);
               __m256i B1 = _mm256_shuffle_epi8(v1, mask);
               __m256i A2 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));
               __m256i B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));
               ret.v0 = _mm256_permute2f128_si256(A2, B2, 2 << 4);
               ret.v1 = _mm256_permute2f128_si256(A2, B2, (3 << 4) | 1);
               return ret;'''.format(**fmtspec)
        else:
            # AVX without AVX2: do the byte shuffles per 128-bit half.
            return \
            '''nsimd_avx_v{typ}x2 ret;
               {load_v0v1}
               __m128i v0a = {exlo_v0};
               __m128i v0b = {exhi_v0};
               __m128i v1a = {exlo_v1};
               __m128i v1b = {exhi_v1};
               __m128i mask = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1,
                                           14, 12, 10, 8, 6, 4, 2, 0);
               __m128i A0a = _mm_shuffle_epi8(v0a, mask);
               __m128i B0a = _mm_shuffle_epi8(v1a, mask);
               __m128i A1a = {perm_a0};
               __m128i B1a = {perm_a1};
               __m128i A0b = _mm_shuffle_epi8(v0b, mask);
               __m128i B0b = _mm_shuffle_epi8(v1b, mask);
               __m128i A1b = {perm_b0};
               __m128i B1b = {perm_b1};
               ret.v0 = {merge_A1};
               ret.v1 = {merge_B1};
               return ret;'''. \
               format(merge_A1=x86.setr('avx', typ, 'A1a', 'A1b'),
                      merge_B1=x86.setr('avx', typ, 'B1a', 'B1b'),
                      perm_a0=perm64('A0a', 'B0a', '0', '0'),
                      perm_a1=perm64('A0a', 'B0a', '1', '1'),
                      perm_b0=perm64('A0b', 'B0b', '0', '0'),
                      perm_b1=perm64('A0b', 'B0b', '1', '1'), **fmtspec)
    if typ in ['i16', 'u16']:
        if simd_ext == 'avx2':
            return \
            '''nsimd_avx2_v{typ}x2 ret;
               {load_v0v1}
               __m256i A1 = _mm256_unpacklo_epi16(v0, v1);
               __m256i B1 = _mm256_unpackhi_epi16(v0, v1);
               __m256i A2 = _mm256_unpacklo_epi16(A1, B1);
               __m256i B2 = _mm256_unpackhi_epi16(A1, B1);
               ret.v0 = _mm256_unpacklo_epi16(A2, B2);
               ret.v1 = _mm256_unpackhi_epi16(A2, B2);
               return ret;'''.format(**fmtspec)
        else:
            return \
            '''nsimd_avx_v{typ}x2 ret;
               {load_v0v1}
               __m128i Aa = {exlo_v0};
               __m128i Ba = {exhi_v0};
               __m128i Ab = {exlo_v1};
               __m128i Bb = {exhi_v1};
               __m128i mask = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2,
                                           13, 12, 9, 8, 5, 4, 1, 0);
               __m128i XY0 = _mm_shuffle_epi8(Aa, mask);
               __m128i XY1 = _mm_shuffle_epi8(Ba, mask);
               __m128i Xa = {perm0};
               __m128i Ya = {perm1};
               XY0 = _mm_shuffle_epi8(Ab, mask);
               XY1 = _mm_shuffle_epi8(Bb, mask);
               __m128i Xb = {perm0};
               __m128i Yb = {perm1};
               ret.v0 = {mergeX};
               ret.v1 = {mergeY};
               return ret;'''. \
               format(perm0=perm64('XY0', 'XY1', '0', '0'),
                      perm1=perm64('XY0', 'XY1', '1', '1'),
                      mergeX=x86.setr('avx', typ, 'Xa', 'Xb'),
                      mergeY=x86.setr('avx', typ, 'Ya', 'Yb'), **fmtspec)
    if typ == 'f32':
        return '''nsimd_{simd_ext}_vf32x2 ret;
                  {load_v0v1}
                  __m256 A1 = _mm256_unpacklo_ps(v0, v1);
                  __m256 B1 = _mm256_unpackhi_ps(v0, v1);
                  ret.v0 = _mm256_unpacklo_ps(A1, B1);
                  ret.v1 = _mm256_unpackhi_ps(A1, B1);
                  return ret;'''.format(**fmtspec)
    if typ in ['i32', 'u32']:
        if simd_ext == 'avx2':
            return \
            '''nsimd_avx2_v{typ}x2 ret;
               {load_v0v1}
               __m256i A1 = _mm256_unpacklo_epi32(v0, v1);
               __m256i B1 = _mm256_unpackhi_epi32(v0, v1);
               ret.v0 = _mm256_unpacklo_epi32(A1, B1);
               ret.v1 = _mm256_unpackhi_epi32(A1, B1);
               return ret;'''.format(**fmtspec)
        else:
            # AVX: reuse the f32 implementation through bit casts.
            return \
            '''nsimd_avx_v{typ}x2 ret;
               nsimd_avx_vf32x2 retf32 = nsimd_load2{a}_avx_f32((f32 *){in0});
               ret.v0 = _mm256_castps_si256(retf32.v0);
               ret.v1 = _mm256_castps_si256(retf32.v1);
               return ret;'''.format(**fmtspec)
    if typ == 'f64':
        return '''nsimd_{simd_ext}_vf64x2 ret;
                  {load_v0v1}
                  ret.v0 = _mm256_unpacklo_pd(v0, v1);
                  ret.v1 = _mm256_unpackhi_pd(v0, v1);
                  return ret;'''.format(**fmtspec)
    if typ in ['i64', 'u64']:
        if simd_ext == 'avx2':
            return \
            '''nsimd_avx2_v{typ}x2 ret;
               {load_v0v1}
               ret.v0 = _mm256_unpacklo_epi64(v0, v1);
               ret.v1 = _mm256_unpackhi_epi64(v0, v1);
               return ret;'''.format(**fmtspec)
        else:
            # AVX: reuse the f64 implementation through bit casts.
            return \
            '''nsimd_avx_v{typ}x2 ret;
               nsimd_avx_vf64x2 retf64 = nsimd_load2{a}_avx_f64((f64 *){in0});
               ret.v0 = _mm256_castpd_si256(retf64.v0);
               ret.v1 = _mm256_castpd_si256(retf64.v1);
               return ret;'''.format(**fmtspec)

###############################################################################

def load2_avx512(simd_ext, typ, align, fmtspec2):
    # Emit the AVX-512 body of load2. 8/16-bit types are processed on
    # extracted 256-bit halves; 32/64-bit types use full-width permutes.
    fmtspec = fmtspec2.copy()
    fmtspec['exlo_v0'] = x86.extract(simd_ext, typ, x86.LO, 'v0')
    fmtspec['exhi_v0'] = x86.extract(simd_ext, typ, x86.HI, 'v0')
    fmtspec['exlo_v1'] = x86.extract(simd_ext, typ, x86.LO, 'v1')
    fmtspec['exhi_v1'] = x86.extract(simd_ext, typ, x86.HI, 'v1')
    fmtspec['load_v0v1'] =
                             get_load_v0v1(simd_ext, typ, align, fmtspec)
    if typ in ['i8', 'u8']:
        return \
        '''nsimd_{simd_ext}_v{typ}x2 ret;
           {load_v0v1}
           __m256i A0 = {exlo_v0};
           __m256i B0 = {exhi_v0};
           __m256i C0 = {exlo_v1};
           __m256i D0 = {exhi_v1};
           __m256i mask = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14,
                              1, 3, 5, 7, 9, 11, 13, 15,
                              0, 2, 4, 6, 8, 10, 12, 14,
                              1, 3, 5, 7, 9, 11, 13, 15);
           __m256i A1 = _mm256_shuffle_epi8(A0, mask);
           __m256i B1 = _mm256_shuffle_epi8(B0, mask);
           __m256i C1 = _mm256_shuffle_epi8(C0, mask);
           __m256i D1 = _mm256_shuffle_epi8(D0, mask);
           __m256i A2 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));
           __m256i B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));
           __m256i C2 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(3,1,2,0));
           __m256i D2 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(3,1,2,0));
           __m256i A3 = _mm256_permute2f128_si256(A2, B2, 2 << 4);
           __m256i B3 = _mm256_permute2f128_si256(A2, B2, (3 << 4) | 1);
           __m256i C3 = _mm256_permute2f128_si256(C2, D2, 2 << 4);
           __m256i D3 = _mm256_permute2f128_si256(C2, D2, (3 << 4) | 1);
           ret.v0 = {mergeAC};
           ret.v1 = {mergeBD};
           return ret;'''.format(mergeAC=x86.setr(simd_ext, typ, 'A3', 'C3'),
                                 mergeBD=x86.setr(simd_ext, typ, 'B3', 'D3'),
                                 **fmtspec)
    if typ in ['i16', 'u16']:
        # Same cascade of unpacks as the AVX2 path, done once per
        # 256-bit half and then merged back into a 512-bit vector.
        return \
        '''nsimd_{simd_ext}_v{typ}x2 ret;
           {load_v0v1}
           __m256i A0a = {exlo_v0};
           __m256i B0a = {exhi_v0};
           __m256i A0b = {exlo_v1};
           __m256i B0b = {exhi_v1};
           __m256i A1 = _mm256_unpacklo_epi16(A0a, B0a);
           __m256i B1 = _mm256_unpackhi_epi16(A0a, B0a);
           __m256i A2 = _mm256_unpacklo_epi16(A1, B1);
           __m256i B2 = _mm256_unpackhi_epi16(A1, B1);
           __m256i A3a = _mm256_unpacklo_epi16(A2, B2);
           __m256i B3a = _mm256_unpackhi_epi16(A2, B2);
           A1 = _mm256_unpacklo_epi16(A0b, B0b);
           B1 = _mm256_unpackhi_epi16(A0b, B0b);
           A2 = _mm256_unpacklo_epi16(A1, B1);
           B2 = _mm256_unpackhi_epi16(A1, B1);
           __m256i A3b = _mm256_unpacklo_epi16(A2, B2);
           __m256i B3b = _mm256_unpackhi_epi16(A2, B2);
           ret.v0 = {mergeA};
           ret.v1 = {mergeB};
           return ret;'''.format(mergeA=x86.setr(simd_ext, typ, 'A3a', 'A3b'),
                                 mergeB=x86.setr(simd_ext, typ, 'B3a', 'B3b'),
                                 **fmtspec)
    if typ in ['f32', 'i32', 'u32']:
        # Full-width two-source permute: even indices go to v0, odd to v1.
        return \
        '''nsimd_{simd_ext}_v{typ}x2 ret;
           {load_v0v1}
           __m512i mask1 = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
                               16, 18, 20, 22, 24, 26, 28, 30);
           __m512i mask2 = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15,
                               17, 19, 21, 23, 25, 27, 29, 31);
           ret.v0 = _mm512_permutex2var{suf}(v0, mask1, v1);
           ret.v1 = _mm512_permutex2var{suf}(v0, mask2, v1);
           return ret;'''.format(**fmtspec)
    if typ in ['f64', 'i64', 'u64']:
        return \
        '''nsimd_{simd_ext}_v{typ}x2 ret;
           {load_v0v1}
           ret.v0 = _mm512_unpacklo{suf}(v0, v1);
           ret.v1 = _mm512_unpackhi{suf}(v0, v1);
           return ret;'''.format(**fmtspec)

###############################################################################

def store2(simd_ext, typ, align, fmtspec2):
    # Emit the body of store2: interleave the two input vectors ({in1},
    # {in2}) and store them to {in0}. `normal` is the plain
    # unpacklo/unpackhi form which is correct whenever the unpack
    # granularity matches the whole register (SSE, and wide types).
    fmtspec = fmtspec2.copy()
    fmtspec['store'] = '{pre}store{a}{sufsi}'.format(a='' if align else 'u',
                                                     **fmtspec)
    if typ in ['f32', 'f64']:
        dest1 = '{in0}'.format(**fmtspec)
        dest2 = '{in0} + {le}'.format(**fmtspec)
    else:
        dest1 = '(__m{nbits}i *){in0}'.format(**fmtspec)
        dest2 = '(__m{nbits}i *){in0} + 1'.format(**fmtspec)
    normal = '''{store}({dest1}, {pre}unpacklo{suf}({in1}, {in2}));
                {store}({dest2}, {pre}unpackhi{suf}({in1}, {in2}));'''.
\
             format(dest1=dest1, dest2=dest2, **fmtspec)
    if simd_ext in sse:
        return normal
    fmtspec['exlo_in1'] = x86.extract(simd_ext, typ, x86.LO, common.in1)
    fmtspec['exhi_in1'] = x86.extract(simd_ext, typ, x86.HI, common.in1)
    fmtspec['exlo_in2'] = x86.extract(simd_ext, typ, x86.LO, common.in2)
    fmtspec['exhi_in2'] = x86.extract(simd_ext, typ, x86.HI, common.in2)
    fmtspec['normal'] = normal
    fmtspec['dest1'] = dest1
    fmtspec['dest2'] = dest2
    if simd_ext == 'avx2':
        if typ in ['i8', 'u8']:
            return \
            '''__m256i A1 = _mm256_permute2f128_si256({in1}, {in2}, 2 << 4);
               __m256i B1 = _mm256_permute2f128_si256(
                                {in1}, {in2}, (3 << 4) | 1);
               __m256i A2 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));
               __m256i B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));
               __m256i mask = _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11,
                                  4, 12, 5, 13, 6, 14, 7, 15,
                                  0, 8, 1, 9, 2, 10, 3, 11,
                                  4, 12, 5, 13, 6, 14, 7, 15);
               {store}({dest1}, _mm256_shuffle_epi8(A2, mask));
               {store}({dest2}, _mm256_shuffle_epi8(B2, mask));'''. \
               format(**fmtspec)
        if typ in ['i16', 'u16']:
            return normal
    if simd_ext == 'avx':
        # AVX without AVX2 integer ops: interleave per 128-bit half with
        # SSE intrinsics, then merge halves back into 256-bit stores.
        if typ in ['i8', 'u8']:
            return \
            '''__m128i v0a = {exlo_in1};
               __m128i v0b = {exhi_in1};
               __m128i v1a = {exlo_in2};
               __m128i v1b = {exhi_in2};
               __m128i A1a = _mm_unpacklo_epi8(v0a, v1a);
               __m128i B1a = _mm_unpackhi_epi8(v0a, v1a);
               __m128i A1b = _mm_unpacklo_epi8(v0b, v1b);
               __m128i B1b = _mm_unpackhi_epi8(v0b, v1b);
               __m256i A1 = {mergeA1};
               __m256i B1 = {mergeB1};
               {store}({dest1}, A1);
               {store}({dest2}, B1);'''. \
               format(mergeA1=x86.setr('avx', typ, 'A1a', 'A1b'),
                      mergeB1=x86.setr('avx', typ, 'B1a', 'B1b'), **fmtspec)
        if typ in ['i16', 'u16']:
            return \
            '''__m128i Xa = {exlo_in1};
               __m128i Xb = {exhi_in1};
               __m128i Ya = {exlo_in2};
               __m128i Yb = {exhi_in2};
               __m128i A0 = _mm_unpacklo_epi16(Xa, Ya);
               __m128i B0 = _mm_unpackhi_epi16(Xa, Ya);
               __m128i A1 = _mm_unpacklo_epi16(Xb, Yb);
               __m128i B1 = _mm_unpackhi_epi16(Xb, Yb);
               __m256i A = {merge0};
               __m256i B = {merge1};
               {store}({dest1}, A);
               {store}({dest2}, B);'''. \
               format(merge0=x86.setr('avx', typ, 'A0', 'B0'),
                      merge1=x86.setr('avx', typ, 'A1', 'B1'), **fmtspec)
    if (simd_ext in avx and typ in ['f32', 'f64']) or \
       simd_ext == 'avx2' and typ in ['i32', 'u32', 'i64', 'u64']:
        return normal
    if simd_ext == 'avx' and typ in ['i32', 'u32', 'i64', 'u64']:
        # AVX has no 256-bit integer unpacks: go through the float domain.
        ftyp = '__m256' if typ in ['i32', 'u32'] else '__m256d'
        fsuf = 'ps' if typ in ['i32', 'u32'] else 'pd'
        return '''{ftyp} v0 = _mm256_castsi256_{fsuf}({in1});
                  {ftyp} v1 = _mm256_castsi256_{fsuf}({in2});
                  {store}({dest1}, _mm256_cast{fsuf}_si256(
                              _mm256_unpacklo_{fsuf}(v0, v1)));
                  {store}({dest2}, _mm256_cast{fsuf}_si256(
                              _mm256_unpackhi_{fsuf}(v0, v1)));'''. \
                  format(ftyp=ftyp, fsuf=fsuf, **fmtspec)
    if simd_ext in avx512:
        if typ in ['i8', 'u8']:
            return \
            '''__m256i A1 = {exlo_in1};
               __m256i B1 = {exhi_in1};
               __m256i C1 = {exlo_in2};
               __m256i D1 = {exhi_in2};
               __m256i A2 = _mm256_permute2f128_si256(A1, C1, 2 << 4);
               __m256i B2 = _mm256_permute2f128_si256(A1, C1, (3 << 4) | 1);
               __m256i C2 = _mm256_permute2f128_si256(B1, D1, 2 << 4);
               __m256i D2 = _mm256_permute2f128_si256(B1, D1, (3 << 4) | 1);
               __m256i A3 = _mm256_permute4x64_epi64(A2, _MM_SHUFFLE(3,1,2,0));
               __m256i B3 = _mm256_permute4x64_epi64(B2, _MM_SHUFFLE(3,1,2,0));
               __m256i C3 = _mm256_permute4x64_epi64(C2, _MM_SHUFFLE(3,1,2,0));
               __m256i D3 = _mm256_permute4x64_epi64(D2, _MM_SHUFFLE(3,1,2,0));
               __m256i mask = _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11,
                                  4, 12, 5, 13, 6, 14, 7, 15,
                                  0, 8, 1, 9, 2, 10, 3, 11,
                                  4, 12, 5, 13, 6, 14, 7, 15);
               __m256i A4 = _mm256_shuffle_epi8(A3, mask);
               __m256i B4 = _mm256_shuffle_epi8(B3, mask);
               __m256i C4 = _mm256_shuffle_epi8(C3, mask);
               __m256i D4 = _mm256_shuffle_epi8(D3, mask);
               {store}({dest1}, {mergeAB});
               {store}({dest2}, {mergeCD});'''. \
               format(mergeAB=x86.setr(simd_ext, typ, 'A4', 'B4'),
                      mergeCD=x86.setr(simd_ext, typ, 'C4', 'D4'), **fmtspec)
        if typ in ['i16', 'u16']:
            return \
            '''__m256i A0a = {exlo_in1};
               __m256i A0b = {exhi_in1};
               __m256i B0a = {exlo_in2};
               __m256i B0b = {exhi_in2};
               __m256i A1a = _mm256_unpacklo_epi16(A0a, B0a);
               __m256i B1a = _mm256_unpackhi_epi16(A0a, B0a);
               __m256i A1b = _mm256_unpacklo_epi16(A0b, B0b);
               __m256i B1b = _mm256_unpackhi_epi16(A0b, B0b);
               {store}({dest1}, {mergea});
               {store}({dest2}, {mergeb});'''. \
               format(mergea=x86.setr(simd_ext, typ, 'A1a', 'B1a'),
                      mergeb=x86.setr(simd_ext, typ, 'A1b', 'B1b'), **fmtspec)
        if typ in ['i32', 'f32', 'u32']:
            # Full-width two-source permute doing the interleave directly.
            return \
            '''__m512i mask1 = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19,
                                   4, 20, 5, 21, 6, 22, 7, 23);
               __m512i mask2 = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27,
                                   12, 28, 13, 29, 14, 30, 15, 31);
               {store}({dest1}, _mm512_permutex2var{suf}({in1}, mask1, {in2}));
               {store}({dest2}, _mm512_permutex2var{suf}(
                                    {in1}, mask2, {in2}));'''.format(**fmtspec)
        if typ in ['i64', 'u64', 'f64']:
            return \
            '''{store}({dest1}, _mm512_unpacklo{suf}({in1}, {in2}));
               {store}({dest2}, _mm512_unpackhi{suf}({in1}, {in2}));'''. \
               format(**fmtspec)

###############################################################################

def get_load_v0v1v2v3(simd_ext, typ, align, fmtspec):
    # Emit C code loading four consecutive full vectors from a0 into
    # v0..v3 (aligned or unaligned depending on `align`).
    load = '{pre}load{a}{sufsi}'.format(a='' if align else 'u', **fmtspec)
    if typ in ['f32', 'f64']:
        return '''{styp} v0 = {load}(a0);
                  {styp} v1 = {load}(a0 + {le});
                  {styp} v2 = {load}(a0 + (2 * {le}));
                  {styp} v3 = {load}(a0 + (3 * {le}));'''. \
                  format(load=load, **fmtspec)
    else:
        return '''{styp} v0 = {load}(({styp}*)a0);
                  {styp} v1 = {load}(({styp}*)a0 + 1);
                  {styp} v2 = {load}(({styp}*)a0 + 2);
                  {styp} v3 = {load}(({styp}*)a0 + 3);'''.
\
                  format(load=load, **fmtspec)

###############################################################################

def load4_sse(simd_ext, typ, align, fmtspec2):
    # Emit the SSE body of load4: load four interleaved vectors and split
    # them into ret.v0..ret.v3. SSE4.2 uses pshufb; SSE2 uses unpacks.
    fmtspec = fmtspec2.copy()
    fmtspec['load_v0v1v2v3'] = get_load_v0v1v2v3('sse', typ, align, fmtspec)
    if typ in ['i8', 'u8']:
        if simd_ext == 'sse42':
            return \
            '''nsimd_sse42_v{typ}x4 ret;
               {load_v0v1v2v3}
               __m128i mask = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2,
                                           13, 9, 5, 1, 12, 8, 4, 0);
               __m128d A1 = _mm_castsi128_pd(_mm_shuffle_epi8(v0, mask));
               __m128d B1 = _mm_castsi128_pd(_mm_shuffle_epi8(v1, mask));
               __m128d C1 = _mm_castsi128_pd(_mm_shuffle_epi8(v2, mask));
               __m128d D1 = _mm_castsi128_pd(_mm_shuffle_epi8(v3, mask));
               __m128 A2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,
                               _MM_SHUFFLE2(0, 0)));
               __m128 A3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,
                               _MM_SHUFFLE2(0, 0)));
               __m128 C2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,
                               _MM_SHUFFLE2(1, 1)));
               __m128 C3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,
                               _MM_SHUFFLE2(1, 1)));
               ret.v0 = _mm_castps_si128(_mm_shuffle_ps(
                            A2, A3, _MM_SHUFFLE(2, 0, 2, 0)));
               ret.v1 = _mm_castps_si128(_mm_shuffle_ps(
                            A2, A3, _MM_SHUFFLE(3, 1, 3, 1)));
               ret.v2 = _mm_castps_si128(_mm_shuffle_ps(
                            C2, C3, _MM_SHUFFLE(2, 0, 2, 0)));
               ret.v3 = _mm_castps_si128(_mm_shuffle_ps(
                            C2, C3, _MM_SHUFFLE(3, 1, 3, 1)));
               return ret;'''.format(**fmtspec)
        else:
            return \
            '''nsimd_sse2_v{typ}x4 ret;
               {load_v0v1v2v3}
               __m128i A1 = _mm_unpacklo_epi8(v0, v2);
               __m128i B1 = _mm_unpackhi_epi8(v0, v2);
               __m128i C1 = _mm_unpacklo_epi8(v1, v3);
               __m128i D1 = _mm_unpackhi_epi8(v1, v3);
               __m128i A2 = _mm_unpacklo_epi8(A1, C1);
               __m128i B2 = _mm_unpackhi_epi8(A1, C1);
               __m128i C2 = _mm_unpacklo_epi8(B1, D1);
               __m128i D2 = _mm_unpackhi_epi8(B1, D1);
               __m128i A3 = _mm_unpacklo_epi8(A2, C2);
               __m128i B3 = _mm_unpackhi_epi8(A2, C2);
               __m128i C3 = _mm_unpacklo_epi8(B2, D2);
               __m128i D3 = _mm_unpackhi_epi8(B2, D2);
               ret.v0 = _mm_unpacklo_epi8(A3, C3);
               ret.v1 = _mm_unpackhi_epi8(A3, C3);
               ret.v2 = _mm_unpacklo_epi8(B3, D3);
               ret.v3 = _mm_unpackhi_epi8(B3, D3);
               return ret;'''.format(**fmtspec)
    if typ in ['i16', 'u16']:
        return \
        '''nsimd_{simd_ext}_v{typ}x4 ret;
           {load_v0v1v2v3}
           __m128i E = _mm_unpacklo_epi16(v0,v1);
           __m128i F = _mm_unpackhi_epi16(v0,v1);
           __m128i G = _mm_unpacklo_epi16(v2,v3);
           __m128i H = _mm_unpackhi_epi16(v2,v3);
           __m128i I = _mm_unpacklo_epi16(E,F);
           __m128i J = _mm_unpackhi_epi16(E,F);
           __m128i K = _mm_unpacklo_epi16(G,H);
           __m128i L = _mm_unpackhi_epi16(G,H);
           ret.v0 = _mm_unpacklo_epi64(I,K);
           ret.v1 = _mm_unpackhi_epi64(I,K);
           ret.v2 = _mm_unpacklo_epi64(J,L);
           ret.v3 = _mm_unpackhi_epi64(J,L);
           return ret;'''.format(**fmtspec)
    if typ in ['f32', 'i32', 'u32']:
        return \
        '''nsimd_{simd_ext}_v{typ}x4 ret;
           {load_v0v1v2v3}
           {styp} A1 = _mm_unpacklo{suf}(v0, v2);
           {styp} B1 = _mm_unpackhi{suf}(v0, v2);
           {styp} C1 = _mm_unpacklo{suf}(v1, v3);
           {styp} D1 = _mm_unpackhi{suf}(v1, v3);
           ret.v0 = _mm_unpacklo{suf}(A1, C1);
           ret.v1 = _mm_unpackhi{suf}(A1, C1);
           ret.v2 = _mm_unpacklo{suf}(B1, D1);
           ret.v3 = _mm_unpackhi{suf}(B1, D1);
           return ret;'''.format(**fmtspec)
    if typ in ['f64', 'i64', 'u64']:
        return \
        '''nsimd_{simd_ext}_v{typ}x4 ret;
           {load_v0v1v2v3}
           ret.v0 = _mm_unpacklo{suf}(v0, v2);
           ret.v1 = _mm_unpackhi{suf}(v0, v2);
           ret.v2 = _mm_unpacklo{suf}(v1, v3);
           ret.v3 = _mm_unpackhi{suf}(v1, v3);
           return ret;'''.format(**fmtspec)

###############################################################################

def load4_avx(simd_ext, typ, align, fmtspec2):
    # Emit the AVX/AVX2 body of load4. Small integer types are handled on
    # extracted 128-bit halves when AVX2 full-width shuffles are missing.
    fmtspec = fmtspec2.copy()
    fmtspec['load_v0v1v2v3'] = get_load_v0v1v2v3('avx', typ, align, fmtspec)
    fmtspec['exlo_v0'] = x86.extract('avx', typ, x86.LO, 'v0')
    fmtspec['exhi_v0'] = x86.extract('avx', typ, x86.HI, 'v0')
    fmtspec['exlo_v1'] = x86.extract('avx', typ, x86.LO, 'v1')
    fmtspec['exhi_v1'] = x86.extract('avx', typ, x86.HI, 'v1')
    fmtspec['exlo_v2'] = x86.extract('avx', typ, x86.LO, 'v2')
    fmtspec['exhi_v2'] = x86.extract('avx', typ, x86.HI, 'v2')
    fmtspec['exlo_v3'] = x86.extract('avx', typ, x86.LO, 'v3')
    fmtspec['exhi_v3'] = x86.extract('avx', typ, x86.HI, 'v3')
    fmtspec['a'] = 'a' if
                          align else 'u'
    if typ in ['i8', 'u8']:
        if simd_ext == 'avx2':
            return \
            '''nsimd_avx2_v{typ}x4 ret;
               {load_v0v1v2v3}
               __m256i mask = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13,
                                  2, 6, 10, 14, 3, 7, 11, 15,
                                  0, 4, 8, 12, 1, 5, 9, 13,
                                  2, 6, 10, 14, 3, 7, 11, 15);
               __m256i mask2 = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
               __m256i A1 = _mm256_shuffle_epi8(v0, mask);
               __m256i B1 = _mm256_shuffle_epi8(v1, mask);
               __m256i C1 = _mm256_shuffle_epi8(v2, mask);
               __m256i D1 = _mm256_shuffle_epi8(v3, mask);
               __m256i A2 = _mm256_permutevar8x32_epi32(A1, mask2);
               __m256i B2 = _mm256_permutevar8x32_epi32(B1, mask2);
               __m256i C2 = _mm256_permutevar8x32_epi32(C1, mask2);
               __m256i D2 = _mm256_permutevar8x32_epi32(D1, mask2);
               __m256i A3 = _mm256_permute2x128_si256(A2, C2, 2 << 4);
               __m256i C3 = _mm256_permute2x128_si256(B2, D2, 2 << 4);
               __m256i B3 = _mm256_permute2x128_si256(A2, C2, (3 << 4) | 1);
               __m256i D3 = _mm256_permute2x128_si256(B2, D2, (3 << 4) | 1);
               ret.v0 = _mm256_unpacklo_epi64(A3, C3);
               ret.v1 = _mm256_unpackhi_epi64(A3, C3);
               ret.v2 = _mm256_unpacklo_epi64(B3, D3);
               ret.v3 = _mm256_unpackhi_epi64(B3, D3);
               return ret;'''.format(**fmtspec)
        else:
            # AVX without AVX2: same pshufb + pd/ps shuffle sequence as
            # the SSE4.2 load4, run on each 128-bit half then merged.
            return \
            '''nsimd_avx_v{typ}x4 ret;
               {load_v0v1v2v3}
               __m128i Aa = {exlo_v0};
               __m128i Ba = {exhi_v0};
               __m128i Ca = {exlo_v1};
               __m128i Da = {exhi_v1};
               __m128i Ab = {exlo_v2};
               __m128i Bb = {exhi_v2};
               __m128i Cb = {exlo_v3};
               __m128i Db = {exhi_v3};
               __m128i mask = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2,
                                           13, 9, 5, 1, 12, 8, 4, 0);
               __m128d A1 = _mm_castsi128_pd(_mm_shuffle_epi8(Aa, mask));
               __m128d B1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ba, mask));
               __m128d C1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ca, mask));
               __m128d D1 = _mm_castsi128_pd(_mm_shuffle_epi8(Da, mask));
               __m128 A2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,
                               _MM_SHUFFLE2(0, 0)));
               __m128 A3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,
                               _MM_SHUFFLE2(0, 0)));
               __m128 C2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,
                               _MM_SHUFFLE2(1, 1)));
               __m128 C3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,
                               _MM_SHUFFLE2(1, 1)));
               __m128i Wa = _mm_castps_si128(_mm_shuffle_ps(A2, A3,
                                _MM_SHUFFLE(2, 0, 2, 0)));
               __m128i Xa = _mm_castps_si128(_mm_shuffle_ps(A2, A3,
                                _MM_SHUFFLE(3, 1, 3, 1)));
               __m128i Ya = _mm_castps_si128(_mm_shuffle_ps(C2, C3,
                                _MM_SHUFFLE(2, 0, 2, 0)));
               __m128i Za = _mm_castps_si128(_mm_shuffle_ps(C2, C3,
                                _MM_SHUFFLE(3, 1, 3, 1)));
               A1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ab, mask));
               B1 = _mm_castsi128_pd(_mm_shuffle_epi8(Bb, mask));
               C1 = _mm_castsi128_pd(_mm_shuffle_epi8(Cb, mask));
               D1 = _mm_castsi128_pd(_mm_shuffle_epi8(Db, mask));
               A2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1, _MM_SHUFFLE2(0, 0)));
               A3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1, _MM_SHUFFLE2(0, 0)));
               C2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1, _MM_SHUFFLE2(1, 1)));
               C3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1, _MM_SHUFFLE2(1, 1)));
               __m128i Wb = _mm_castps_si128(_mm_shuffle_ps(A2, A3,
                                _MM_SHUFFLE(2, 0, 2, 0)));
               __m128i Xb = _mm_castps_si128(_mm_shuffle_ps(A2, A3,
                                _MM_SHUFFLE(3, 1, 3, 1)));
               __m128i Yb = _mm_castps_si128(_mm_shuffle_ps(C2, C3,
                                _MM_SHUFFLE(2, 0, 2, 0)));
               __m128i Zb = _mm_castps_si128(_mm_shuffle_ps(C2, C3,
                                _MM_SHUFFLE(3, 1, 3, 1)));
               ret.v0 = {mergeW};
               ret.v1 = {mergeX};
               ret.v2 = {mergeY};
               ret.v3 = {mergeZ};
               return ret;'''.format(mergeW=x86.setr('avx', typ, 'Wa', 'Wb'),
                                     mergeX=x86.setr('avx', typ, 'Xa', 'Xb'),
                                     mergeY=x86.setr('avx', typ, 'Ya', 'Yb'),
                                     mergeZ=x86.setr('avx', typ, 'Za', 'Zb'),
                                     **fmtspec)
    if typ in ['i16', 'u16']:
        if simd_ext == 'avx2':
            return \
            '''nsimd_avx2_v{typ}x4 ret;
               {load_v0v1v2v3}
               __m256i A1 = _mm256_unpacklo_epi16(v0, v2);
               __m256i B1 = _mm256_unpackhi_epi16(v0, v2);
               __m256i C1 = _mm256_unpacklo_epi16(v1, v3);
               __m256i D1 = _mm256_unpackhi_epi16(v1, v3);
               __m256i A2 = _mm256_unpacklo_epi16(A1, C1);
               __m256i B2 = _mm256_unpackhi_epi16(A1, C1);
               __m256i C2 = _mm256_unpacklo_epi16(B1, D1);
               __m256i D2 = _mm256_unpackhi_epi16(B1, D1);
               ret.v0 = _mm256_unpacklo_epi16(A2, C2);
               ret.v1 = _mm256_unpackhi_epi16(A2, C2);
               ret.v2 = _mm256_unpacklo_epi16(B2, D2);
               ret.v3 = _mm256_unpackhi_epi16(B2, D2);
               return ret;'''.format(**fmtspec)
        else:
            return \
            '''nsimd_avx_v{typ}x4 ret;
               {load_v0v1v2v3}
               __m128i Aa = {exlo_v0};
               __m128i Ba = {exhi_v0};
               __m128i Ca = {exlo_v1};
               __m128i Da = {exhi_v1};
               __m128i Ab = {exlo_v2};
               __m128i Bb = {exhi_v2};
               __m128i Cb = {exlo_v3};
               __m128i Db = {exhi_v3};
               __m128i mask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4,
                                           11, 10, 3, 2, 9, 8, 1, 0);
               __m128d A1 = _mm_castsi128_pd(_mm_shuffle_epi8(Aa, mask));
               __m128d B1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ba, mask));
               __m128d C1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ca, mask));
               __m128d D1 = _mm_castsi128_pd(_mm_shuffle_epi8(Da, mask));
               __m128 A2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,
                               _MM_SHUFFLE2(0, 0)));
               __m128 A3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,
                               _MM_SHUFFLE2(0, 0)));
               __m128 C2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,
                               _MM_SHUFFLE2(1, 1)));
               __m128 C3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,
                               _MM_SHUFFLE2(1, 1)));
               __m128i Wa = _mm_castps_si128(_mm_shuffle_ps(A2, A3,
                                _MM_SHUFFLE(2, 0, 2, 0)));
               __m128i Xa = _mm_castps_si128(_mm_shuffle_ps(A2, A3,
                                _MM_SHUFFLE(3, 1, 3, 1)));
               __m128i Ya = _mm_castps_si128(_mm_shuffle_ps(C2, C3,
                                _MM_SHUFFLE(2, 0, 2, 0)));
               __m128i Za = _mm_castps_si128(_mm_shuffle_ps(C2, C3,
                                _MM_SHUFFLE(3, 1, 3, 1)));
               A1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ab, mask));
               B1 = _mm_castsi128_pd(_mm_shuffle_epi8(Bb, mask));
               C1 = _mm_castsi128_pd(_mm_shuffle_epi8(Cb, mask));
               D1 = _mm_castsi128_pd(_mm_shuffle_epi8(Db, mask));
               A2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1, _MM_SHUFFLE2(0, 0)));
               A3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1, _MM_SHUFFLE2(0, 0)));
               C2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1, _MM_SHUFFLE2(1, 1)));
               C3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1, _MM_SHUFFLE2(1, 1)));
               __m128i Wb = _mm_castps_si128(_mm_shuffle_ps(A2, A3,
                                _MM_SHUFFLE(2, 0, 2, 0)));
               __m128i Xb = _mm_castps_si128(_mm_shuffle_ps(A2, A3,
                                _MM_SHUFFLE(3, 1, 3, 1)));
               __m128i Yb = _mm_castps_si128(_mm_shuffle_ps(C2, C3,
                                _MM_SHUFFLE(2, 0, 2, 0)));
               __m128i Zb = _mm_castps_si128(_mm_shuffle_ps(C2, C3,
                                _MM_SHUFFLE(3, 1, 3, 1)));
               ret.v0 = {mergeW};
               ret.v1 = {mergeX};
               ret.v2 = {mergeY};
               ret.v3 = {mergeZ};
               return ret;'''.format(mergeW=x86.setr('avx', typ, 'Wa', 'Wb'),
                                     mergeX=x86.setr('avx', typ, 'Xa', 'Xb'),
                                     mergeY=x86.setr('avx', typ, 'Ya', 'Yb'),
                                     mergeZ=x86.setr('avx', typ, 'Za', 'Zb'),
                                     **fmtspec)
    if typ == 'f32':
        return '''nsimd_{simd_ext}_vf32x4 ret;
                  {load_v0v1v2v3}
                  __m256 A1 = _mm256_unpacklo_ps(v0, v2);
                  __m256 B1 = _mm256_unpackhi_ps(v0, v2);
                  __m256 C1 = _mm256_unpacklo_ps(v1, v3);
                  __m256 D1 = _mm256_unpackhi_ps(v1, v3);
                  ret.v0 = _mm256_unpacklo_ps(A1, C1);
                  ret.v1 = _mm256_unpackhi_ps(A1, C1);
                  ret.v2 = _mm256_unpacklo_ps(B1, D1);
                  ret.v3 = _mm256_unpackhi_ps(B1, D1);
                  return ret;'''.format(**fmtspec)
    if typ in ['i32', 'u32']:
        if simd_ext == 'avx2':
            return \
            '''nsimd_avx2_v{typ}x4 ret;
               {load_v0v1v2v3}
               __m256i A1 = _mm256_unpacklo_epi32(v0, v2);
               __m256i B1 = _mm256_unpackhi_epi32(v0, v2);
               __m256i C1 = _mm256_unpacklo_epi32(v1, v3);
               __m256i D1 = _mm256_unpackhi_epi32(v1, v3);
               ret.v0 = _mm256_unpacklo_epi32(A1, C1);
               ret.v1 = _mm256_unpackhi_epi32(A1, C1);
               ret.v2 = _mm256_unpacklo_epi32(B1, D1);
               ret.v3 = _mm256_unpackhi_epi32(B1, D1);
               return ret;'''.format(**fmtspec)
        else:
            # AVX: reuse the f32 implementation through bit casts.
            return \
            '''nsimd_avx_v{typ}x4 ret;
               nsimd_avx_vf32x4 retf32 = nsimd_load4{a}_avx_f32((f32 *){in0});
               ret.v0 = _mm256_castps_si256(retf32.v0);
               ret.v1 = _mm256_castps_si256(retf32.v1);
               ret.v2 = _mm256_castps_si256(retf32.v2);
               ret.v3 = _mm256_castps_si256(retf32.v3);
               return ret;'''.format(**fmtspec)
    if typ == 'f64':
        return \
        '''nsimd_{simd_ext}_vf64x4 ret;
           {load_v0v1v2v3}
           __m256d A1 = _mm256_permute2f128_pd(v0, v2, 2 << 4);
           __m256d B1 = _mm256_permute2f128_pd(v0, v2, (3 << 4) | 1);
           __m256d C1 = _mm256_permute2f128_pd(v1, v3, 2 << 4);
           __m256d D1 = _mm256_permute2f128_pd(v1, v3, (3 << 4) | 1);
           ret.v0 = _mm256_unpacklo_pd(A1, C1);
           ret.v1 = _mm256_unpackhi_pd(A1, C1);
           ret.v2 = _mm256_unpacklo_pd(B1, D1);
           ret.v3 = _mm256_unpackhi_pd(B1, D1);
           return ret;'''.format(**fmtspec)
    if typ in ['i64', 'u64']:
        if simd_ext == 'avx2':
            return \
            '''nsimd_avx2_v{typ}x4 ret;
{load_v0v1v2v3}
__m256i A1 = _mm256_permute2f128_si256(v0, v2, 2 << 4);
__m256i B1 = _mm256_permute2f128_si256(v0, v2, (3 << 4) | 1);
__m256i C1 = _mm256_permute2f128_si256(v1, v3, 2 << 4);
__m256i D1 = _mm256_permute2f128_si256(v1, v3, (3 << 4) | 1);
ret.v0 = _mm256_unpacklo_epi64(A1, C1);
ret.v1 = _mm256_unpackhi_epi64(A1, C1);
ret.v2 = _mm256_unpacklo_epi64(B1, D1);
ret.v3 = _mm256_unpackhi_epi64(B1, D1);
return ret;'''.format(**fmtspec)
        else:
            # Plain AVX has no 256-bit integer unpacks: reuse the f64
            # deinterleave and bitcast the four resulting vectors back to
            # the integer type.
            return \
            '''nsimd_avx_vf64x4 retf64 = nsimd_load4{a}_avx_f64((f64 *){in0});
nsimd_avx_v{typ}x4 ret;
ret.v0 = _mm256_castpd_si256(retf64.v0);
ret.v1 = _mm256_castpd_si256(retf64.v1);
ret.v2 = _mm256_castpd_si256(retf64.v2);
ret.v3 = _mm256_castpd_si256(retf64.v3);
return ret;'''.format(**fmtspec)

###############################################################################

def load4_avx512(simd_ext, typ, align, fmtspec2):
    # Emit the C body of load4 (load of 4 interleaved vectors, i.e. an
    # AoS -> SoA deinterleave by 4) for the AVX-512 extensions.  'align'
    # selects aligned vs unaligned loads; the returned string is C code.
    fmtspec = fmtspec2.copy()
    fmtspec['load_v0v1v2v3'] = get_load_v0v1v2v3(simd_ext, typ, align, fmtspec)
    # Low/high 256-bit halves of the four loaded 512-bit vectors: the 8-bit
    # and 16-bit algorithms below work on __m256i halves (AVX2 intrinsics).
    fmtspec['exlo_v0'] = x86.extract(simd_ext, typ, x86.LO, 'v0')
    fmtspec['exhi_v0'] = x86.extract(simd_ext, typ, x86.HI, 'v0')
    fmtspec['exlo_v1'] = x86.extract(simd_ext, typ, x86.LO, 'v1')
    fmtspec['exhi_v1'] = x86.extract(simd_ext, typ, x86.HI, 'v1')
    fmtspec['exlo_v2'] = x86.extract(simd_ext, typ, x86.LO, 'v2')
    fmtspec['exhi_v2'] = x86.extract(simd_ext, typ, x86.HI, 'v2')
    fmtspec['exlo_v3'] = x86.extract(simd_ext, typ, x86.LO, 'v3')
    fmtspec['exhi_v3'] = x86.extract(simd_ext, typ, x86.HI, 'v3')
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        # 8-bit lanes: per-half pshufb gathers every 4th byte, then 32-bit
        # cross-lane permutes and 64-bit unpacks regroup them; halves are
        # merged back into 512-bit vectors with x86.setr.
        return \
        '''nsimd_{simd_ext}_v{typ}x4 ret;
{load_v0v1v2v3}
__m256i A0a = {exlo_v0};
__m256i B0a = {exhi_v0};
__m256i C0a = {exlo_v1};
__m256i D0a = {exhi_v1};
__m256i A0b = {exlo_v2};
__m256i B0b = {exhi_v2};
__m256i C0b = {exlo_v3};
__m256i D0b = {exhi_v3};
__m256i mask = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
__m256i mask2 = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
__m256i A1 = _mm256_shuffle_epi8(A0a, mask);
__m256i B1 = _mm256_shuffle_epi8(B0a, mask);
__m256i C1 = _mm256_shuffle_epi8(C0a, mask);
__m256i D1 = _mm256_shuffle_epi8(D0a, mask);
__m256i A2 = _mm256_permutevar8x32_epi32(A1, mask2);
__m256i B2 = _mm256_permutevar8x32_epi32(B1, mask2);
__m256i C2 = _mm256_permutevar8x32_epi32(C1, mask2);
__m256i D2 = _mm256_permutevar8x32_epi32(D1, mask2);
__m256i A3 = _mm256_permute2x128_si256(A2, C2, 2 << 4);
__m256i C3 = _mm256_permute2x128_si256(B2, D2, 2 << 4);
__m256i B3 = _mm256_permute2x128_si256(A2, C2, (3 << 4) | 1);
__m256i D3 = _mm256_permute2x128_si256(B2, D2, (3 << 4) | 1);
__m256i A4a = _mm256_unpacklo_epi64(A3, C3);
__m256i B4a = _mm256_unpackhi_epi64(A3, C3);
__m256i C4a = _mm256_unpacklo_epi64(B3, D3);
__m256i D4a = _mm256_unpackhi_epi64(B3, D3);
A1 = _mm256_shuffle_epi8(A0b, mask);
B1 = _mm256_shuffle_epi8(B0b, mask);
C1 = _mm256_shuffle_epi8(C0b, mask);
D1 = _mm256_shuffle_epi8(D0b, mask);
A2 = _mm256_permutevar8x32_epi32(A1, mask2);
B2 = _mm256_permutevar8x32_epi32(B1, mask2);
C2 = _mm256_permutevar8x32_epi32(C1, mask2);
D2 = _mm256_permutevar8x32_epi32(D1, mask2);
A3 = _mm256_permute2x128_si256(A2, C2, 2 << 4);
C3 = _mm256_permute2x128_si256(B2, D2, 2 << 4);
B3 = _mm256_permute2x128_si256(A2, C2, (3 << 4) | 1);
D3 = _mm256_permute2x128_si256(B2, D2, (3 << 4) | 1);
__m256i A4b = _mm256_unpacklo_epi64(A3, C3);
__m256i B4b = _mm256_unpackhi_epi64(A3, C3);
__m256i C4b = _mm256_unpacklo_epi64(B3, D3);
__m256i D4b = _mm256_unpackhi_epi64(B3, D3);
ret.v0 = {mergeA};
ret.v1 = {mergeB};
ret.v2 = {mergeC};
ret.v3 = {mergeD};
return ret;'''.format(mergeA=x86.setr(simd_ext, typ, 'A4a', 'A4b'),
                      mergeB=x86.setr(simd_ext, typ, 'B4a', 'B4b'),
                      mergeC=x86.setr(simd_ext, typ, 'C4a', 'C4b'),
                      mergeD=x86.setr(simd_ext, typ, 'D4a', 'D4b'), **fmtspec)
    if typ in ['i16', 'u16']:
        # 16-bit lanes: three rounds of 16-bit unpacks per 256-bit half
        # converge to the deinterleaved layout; halves merged with x86.setr.
        return \
        '''nsimd_{simd_ext}_v{typ}x4 ret;
{load_v0v1v2v3}
__m256i A0a = {exlo_v0};
__m256i B0a = {exhi_v0};
__m256i C0a = {exlo_v1};
__m256i D0a = {exhi_v1};
__m256i A0b = {exlo_v2};
__m256i B0b = {exhi_v2};
__m256i C0b = {exlo_v3};
__m256i D0b = {exhi_v3};
__m256i A1 = _mm256_unpacklo_epi16(A0a, C0a);
__m256i B1 = _mm256_unpackhi_epi16(A0a, C0a);
__m256i C1 = _mm256_unpacklo_epi16(B0a, D0a);
__m256i D1 = _mm256_unpackhi_epi16(B0a, D0a);
__m256i A2 = _mm256_unpacklo_epi16(A1, C1);
__m256i B2 = _mm256_unpackhi_epi16(A1, C1);
__m256i C2 = _mm256_unpacklo_epi16(B1, D1);
__m256i D2 = _mm256_unpackhi_epi16(B1, D1);
__m256i A3a = _mm256_unpacklo_epi16(A2, C2);
__m256i B3a = _mm256_unpackhi_epi16(A2, C2);
__m256i C3a = _mm256_unpacklo_epi16(B2, D2);
__m256i D3a = _mm256_unpackhi_epi16(B2, D2);
A1 = _mm256_unpacklo_epi16(A0b, C0b);
B1 = _mm256_unpackhi_epi16(A0b, C0b);
C1 = _mm256_unpacklo_epi16(B0b, D0b);
D1 = _mm256_unpackhi_epi16(B0b, D0b);
A2 = _mm256_unpacklo_epi16(A1, C1);
B2 = _mm256_unpackhi_epi16(A1, C1);
C2 = _mm256_unpacklo_epi16(B1, D1);
D2 = _mm256_unpackhi_epi16(B1, D1);
__m256i A3b = _mm256_unpacklo_epi16(A2, C2);
__m256i B3b = _mm256_unpackhi_epi16(A2, C2);
__m256i C3b = _mm256_unpacklo_epi16(B2, D2);
__m256i D3b = _mm256_unpackhi_epi16(B2, D2);
ret.v0 = {mergeA};
ret.v1 = {mergeB};
ret.v2 = {mergeC};
ret.v3 = {mergeD};
return ret;'''.format(mergeA=x86.setr(simd_ext, typ, 'A3a', 'A3b'),
                      mergeB=x86.setr(simd_ext, typ, 'B3a', 'B3b'),
                      mergeC=x86.setr(simd_ext, typ, 'C3a', 'C3b'),
                      mergeD=x86.setr(simd_ext, typ, 'D3a', 'D3b'), **fmtspec)
    if typ in ['f32', 'i32', 'u32']:
        # 32-bit lanes: full-width two-source permutes (vpermt2d/vpermt2ps)
        # do the whole deinterleave in two rounds.
        return \
        '''nsimd_{simd_ext}_v{typ}x4 ret;
{load_v0v1v2v3}
__m512i WXm = _mm512_setr_epi32(0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29);
__m512i YZm = _mm512_setr_epi32(2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31);
__m512i Wm = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23);
__m512i Xm = _mm512_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31);
{styp} WXa = _mm512_permutex2var{suf}(v0, WXm, v1);
{styp} WXb = _mm512_permutex2var{suf}(v2, WXm, v3);
{styp} YZa = _mm512_permutex2var{suf}(v0, YZm, v1);
{styp} YZb = _mm512_permutex2var{suf}(v2, YZm, v3);
ret.v0 = _mm512_permutex2var{suf}(WXa, Wm, WXb);
ret.v1 = _mm512_permutex2var{suf}(WXa, Xm, WXb);
ret.v2 = _mm512_permutex2var{suf}(YZa, Wm, YZb);
ret.v3 = _mm512_permutex2var{suf}(YZa, Xm, YZb);
return ret;'''.format(**fmtspec)
    if typ in ['f64', 'i64', 'u64']:
        # 64-bit lanes: one round of unpacks then two-source permutes.
        return \
        '''nsimd_{simd_ext}_v{typ}x4 ret;
{load_v0v1v2v3}
{styp} A1 = _mm512_unpacklo{suf}(v0, v1);
{styp} B1 = _mm512_unpacklo{suf}(v2, v3);
{styp} C1 = _mm512_unpackhi{suf}(v0, v1);
{styp} D1 = _mm512_unpackhi{suf}(v2, v3);
__m512i A_mask = _mm512_set_epi64(13, 9, 12, 8, 5, 1, 4, 0);
__m512i B_mask = _mm512_set_epi64(15, 11, 14, 10, 7, 3, 6, 2);
ret.v0 = _mm512_permutex2var{suf}(A1, A_mask, B1);
ret.v1 = _mm512_permutex2var{suf}(C1, A_mask, D1);
ret.v2 = _mm512_permutex2var{suf}(A1, B_mask, B1);
ret.v3 = _mm512_permutex2var{suf}(C1, B_mask, D1);
return ret;'''.format(**fmtspec)

###############################################################################

def store4(simd_ext, typ, align, fmtspec2, v0, v1, v2, v3):
    # Emit the final four consecutive vector stores of a store4: v0..v3 are
    # the C variable names holding the already-interleaved vectors.  Float
    # types store through the typed pointer directly; integer types cast
    # {in0} to the native SIMD pointer type.
    fmtspec = fmtspec2.copy()
    fmtspec['a'] = '' if align else 'u'
    store = '{pre}store{a}{sufsi}'.format(**fmtspec)
    fmtspec['store'] = store
    fmtspec['v0'] = v0
    fmtspec['v1'] = v1
    fmtspec['v2'] = v2
    fmtspec['v3'] = v3
    if typ in ['f32', 'f64']:
        return \
        '''{store}({in0}, {v0});
{store}({in0} + {le}, {v1});
{store}({in0} + (2 * {le}), {v2});
{store}({in0} + (3 * {le}), {v3});'''.format(**fmtspec)
    else:
        return \
        '''{store}(({styp} *){in0}, {v0});
{store}(({styp} *){in0} + 1, {v1});
{store}(({styp} *){in0} + 2, {v2});
{store}(({styp} *){in0} + 3, {v3});'''.format(**fmtspec)

###############################################################################

def store4_sse(typ, align, fmtspec2):
    # Emit the C body of store4 (SoA -> AoS interleave by 4 then store) for
    # the SSE extensions; only unpack instructions are needed so the same
    # code serves sse2 and sse42 (hence no simd_ext parameter).
    fmtspec = fmtspec2.copy()
    if typ in ['i8', 'u8']:
        return \
        '''__m128i A5 = _mm_unpacklo_epi8({in1}, {in3});
__m128i B5 = _mm_unpackhi_epi8({in1}, {in3});
__m128i C5 = _mm_unpacklo_epi8({in2}, {in4});
__m128i D5 = _mm_unpackhi_epi8({in2}, {in4});
__m128i A6 = _mm_unpacklo_epi8(A5, C5);
__m128i B6 = _mm_unpackhi_epi8(A5, C5);
__m128i C6 = _mm_unpacklo_epi8(B5, D5);
__m128i D6 = _mm_unpackhi_epi8(B5, D5);
{store}'''.format(store=store4('sse', typ, align, fmtspec, 'A6', 'B6', 'C6', 'D6'), **fmtspec)
    if typ in ['i16', 'u16']:
        return \
        '''__m128i Q = _mm_unpacklo_epi16({in1}, {in2});
__m128i R = _mm_unpackhi_epi16({in1}, {in2});
__m128i S = _mm_unpacklo_epi16({in3}, {in4});
__m128i T = _mm_unpackhi_epi16({in3}, {in4});
__m128i U = _mm_unpacklo_epi32(Q, S);
__m128i V = _mm_unpackhi_epi32(Q, S);
__m128i W = _mm_unpacklo_epi32(R, T);
__m128i X = _mm_unpackhi_epi32(R, T);
{store}'''.format(store=store4('sse', typ, align, fmtspec, 'U', 'V', 'W', 'X'), **fmtspec)
    if typ in ['f32', 'i32', 'u32']:
        return \
        '''{styp} A3 = _mm_unpacklo{suf}({in1}, {in3});
{styp} B3 = _mm_unpackhi{suf}({in1}, {in3});
{styp} C3 = _mm_unpacklo{suf}({in2}, {in4});
{styp} D3 = _mm_unpackhi{suf}({in2}, {in4});
{styp} A4 = _mm_unpacklo{suf}(A3, C3);
{styp} B4 = _mm_unpackhi{suf}(A3, C3);
{styp} C4 = _mm_unpacklo{suf}(B3, D3);
{styp} D4 = _mm_unpackhi{suf}(B3, D3);
{store}'''.format(store=store4('sse', typ, align, fmtspec, 'A4', 'B4', 'C4', 'D4'), **fmtspec)
    if typ in ['f64', 'u64', 'i64']:
        # 64-bit lanes: a single unpack round interleaves 2-lane vectors.
        return \
        '''{styp} A0 = _mm_unpacklo{suf}({in1}, {in2});
{styp} B0 = _mm_unpacklo{suf}({in3}, {in4});
{styp} C0 = _mm_unpackhi{suf}({in1}, {in2});
{styp} D0 = _mm_unpackhi{suf}({in3}, {in4});
{store}'''.format(store=store4('sse', typ, align, fmtspec, 'A0', 'B0', 'C0', 'D0'), **fmtspec)

###############################################################################

def store4_avx(simd_ext, typ, align, fmtspec2):
    # Emit the C body of store4 for AVX/AVX2.  Plain AVX lacks 256-bit
    # integer shuffles, so integer types fall back to 128-bit halves
    # (exlo/exhi below) or to bitcasts through the float implementations.
    fmtspec = fmtspec2.copy()
    fmtspec['exlo_in1'] = x86.extract('avx', typ, x86.LO, common.in1)
    fmtspec['exhi_in1'] = x86.extract('avx', typ, x86.HI, common.in1)
    fmtspec['exlo_in2'] = x86.extract('avx', typ, x86.LO, common.in2)
    fmtspec['exhi_in2'] = x86.extract('avx', typ,
                                     x86.HI, common.in2)
    fmtspec['exlo_in3'] = x86.extract('avx', typ, x86.LO, common.in3)
    fmtspec['exhi_in3'] = x86.extract('avx', typ, x86.HI, common.in3)
    fmtspec['exlo_in4'] = x86.extract('avx', typ, x86.LO, common.in4)
    fmtspec['exhi_in4'] = x86.extract('avx', typ, x86.HI, common.in4)
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        if simd_ext == 'avx2':
            # AVX2: 256-bit byte unpacks work per 128-bit lane, so a
            # permute4x64 fixes the lane order between the two unpack rounds.
            return \
            '''__m256i A1 = _mm256_unpacklo_epi8({in1}, {in3});
__m256i B1 = _mm256_unpackhi_epi8({in1}, {in3});
__m256i C1 = _mm256_unpacklo_epi8({in2}, {in4});
__m256i D1 = _mm256_unpackhi_epi8({in2}, {in4});
__m256i A2 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));
__m256i B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));
__m256i C2 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(3,1,2,0));
__m256i D2 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(3,1,2,0));
__m256i A = _mm256_unpacklo_epi8(A2, C2);
__m256i B = _mm256_unpacklo_epi8(B2, D2);
__m256i C = _mm256_unpackhi_epi8(A2, C2);
__m256i D = _mm256_unpackhi_epi8(B2, D2);
{store}'''.format(store=store4('avx', typ, align, fmtspec, 'A', 'B', 'C', 'D'), **fmtspec)
        else:
            # Plain AVX: interleave the 128-bit halves with SSE2 unpacks,
            # then rebuild 256-bit vectors with x86.setr.
            return \
            '''__m128i Wa = {exlo_in1};
__m128i Wb = {exhi_in1};
__m128i Xa = {exlo_in2};
__m128i Xb = {exhi_in2};
__m128i Ya = {exlo_in3};
__m128i Yb = {exhi_in3};
__m128i Za = {exlo_in4};
__m128i Zb = {exhi_in4};
__m128i AA = _mm_unpacklo_epi8(Wa, Ya);
__m128i BB = _mm_unpackhi_epi8(Wa, Ya);
__m128i CC = _mm_unpacklo_epi8(Xa, Za);
__m128i DD = _mm_unpackhi_epi8(Xa, Za);
__m128i A0 = _mm_unpacklo_epi8(AA, CC);
__m128i B0 = _mm_unpackhi_epi8(AA, CC);
__m128i C0 = _mm_unpacklo_epi8(BB, DD);
__m128i D0 = _mm_unpackhi_epi8(BB, DD);
AA = _mm_unpacklo_epi8(Wb, Yb);
BB = _mm_unpackhi_epi8(Wb, Yb);
CC = _mm_unpacklo_epi8(Xb, Zb);
DD = _mm_unpackhi_epi8(Xb, Zb);
__m128i A1 = _mm_unpacklo_epi8(AA, CC);
__m128i B1 = _mm_unpackhi_epi8(AA, CC);
__m128i C1 = _mm_unpacklo_epi8(BB, DD);
__m128i D1 = _mm_unpackhi_epi8(BB, DD);
__m256i A = {mergeAB0};
__m256i B = {mergeCD0};
__m256i C = {mergeAB1};
__m256i D = {mergeCD1};
{store}'''.format(mergeAB0=x86.setr('avx', typ, 'A0', 'B0'),
                  mergeCD0=x86.setr('avx', typ, 'C0', 'D0'),
                  mergeAB1=x86.setr('avx', typ, 'A1', 'B1'),
                  mergeCD1=x86.setr('avx', typ, 'C1', 'D1'),
                  store=store4('avx', typ, align, fmtspec, 'A', 'B', 'C', 'D'),
                  **fmtspec)
    if typ in ['i16', 'u16']:
        if simd_ext == 'avx2':
            return \
            '''__m256i A3 = _mm256_unpacklo_epi16({in1}, {in3});
__m256i B3 = _mm256_unpackhi_epi16({in1}, {in3});
__m256i C3 = _mm256_unpacklo_epi16({in2}, {in4});
__m256i D3 = _mm256_unpackhi_epi16({in2}, {in4});
__m256i A = _mm256_unpacklo_epi16(A3, C3);
__m256i B = _mm256_unpackhi_epi16(A3, C3);
__m256i C = _mm256_unpacklo_epi16(B3, D3);
__m256i D = _mm256_unpackhi_epi16(B3, D3);
{store}'''.format(store=store4('avx', typ, align, fmtspec, 'A', 'B', 'C', 'D'), **fmtspec)
        else:
            # Plain AVX: same halves strategy as the i8/u8 fallback above.
            return \
            '''__m128i Wa = {exlo_in1};
__m128i Wb = {exhi_in1};
__m128i Xa = {exlo_in2};
__m128i Xb = {exhi_in2};
__m128i Ya = {exlo_in3};
__m128i Yb = {exhi_in3};
__m128i Za = {exlo_in4};
__m128i Zb = {exhi_in4};
__m128i AA = _mm_unpacklo_epi16(Wa, Ya);
__m128i BB = _mm_unpackhi_epi16(Wa, Ya);
__m128i CC = _mm_unpacklo_epi16(Xa, Za);
__m128i DD = _mm_unpackhi_epi16(Xa, Za);
__m128i A0 = _mm_unpacklo_epi16(AA, CC);
__m128i B0 = _mm_unpackhi_epi16(AA, CC);
__m128i C0 = _mm_unpacklo_epi16(BB, DD);
__m128i D0 = _mm_unpackhi_epi16(BB, DD);
AA = _mm_unpacklo_epi16(Wb, Yb);
BB = _mm_unpackhi_epi16(Wb, Yb);
CC = _mm_unpacklo_epi16(Xb, Zb);
DD = _mm_unpackhi_epi16(Xb, Zb);
__m128i A1 = _mm_unpacklo_epi16(AA, CC);
__m128i B1 = _mm_unpackhi_epi16(AA, CC);
__m128i C1 = _mm_unpacklo_epi16(BB, DD);
__m128i D1 = _mm_unpackhi_epi16(BB, DD);
__m256i A = {mergeAB0};
__m256i B = {mergeCD0};
__m256i C = {mergeAB1};
__m256i D = {mergeCD1};
{store}'''.format(mergeAB0=x86.setr('avx', typ, 'A0', 'B0'),
                  mergeCD0=x86.setr('avx', typ, 'C0', 'D0'),
                  mergeAB1=x86.setr('avx', typ, 'A1', 'B1'),
                  mergeCD1=x86.setr('avx', typ, 'C1', 'D1'),
                  store=store4('avx', typ, align, fmtspec, 'A', 'B', 'C', 'D'),
                  **fmtspec)
    if typ == 'f32':
        return \
        '''__m256 A3 = _mm256_unpacklo_ps({in1}, {in3});
__m256 B3 = _mm256_unpackhi_ps({in1}, {in3});
__m256 C3 = _mm256_unpacklo_ps({in2}, {in4});
__m256 D3 = _mm256_unpackhi_ps({in2}, {in4});
__m256 A = _mm256_unpacklo_ps(A3, C3);
__m256 B = _mm256_unpackhi_ps(A3, C3);
__m256 C = _mm256_unpacklo_ps(B3, D3);
__m256 D = _mm256_unpackhi_ps(B3, D3);
{store}'''.format(store=store4('avx', typ, align, fmtspec, 'A', 'B', 'C', 'D'), **fmtspec)
    if typ in ['i32', 'u32']:
        if simd_ext == 'avx2':
            return \
            '''__m256i A3 = _mm256_unpacklo_epi32({in1}, {in3});
__m256i B3 = _mm256_unpackhi_epi32({in1}, {in3});
__m256i C3 = _mm256_unpacklo_epi32({in2}, {in4});
__m256i D3 = _mm256_unpackhi_epi32({in2}, {in4});
__m256i A = _mm256_unpacklo_epi32(A3, C3);
__m256i B = _mm256_unpackhi_epi32(A3, C3);
__m256i C = _mm256_unpacklo_epi32(B3, D3);
__m256i D = _mm256_unpackhi_epi32(B3, D3);
{store}'''.format(store=store4('avx', typ, align, fmtspec, 'A', 'B', 'C', 'D'), **fmtspec)
        else:
            # Plain AVX: bitcast to f32 and reuse the float store4.
            return \
            '''nsimd_store4{a}_avx_f32((f32 *){in0},
_mm256_castsi256_ps({in1}),
_mm256_castsi256_ps({in2}),
_mm256_castsi256_ps({in3}),
_mm256_castsi256_ps({in4}));'''. \
            format(**fmtspec)
    if typ == 'f64':
        return \
        '''__m256d A3 = _mm256_permute2f128_pd({in1}, {in3}, 2 << 4);
__m256d B3 = _mm256_permute2f128_pd({in2}, {in4}, 2 << 4);
__m256d C3 = _mm256_permute2f128_pd({in1}, {in3}, (3 << 4) | 1);
__m256d D3 = _mm256_permute2f128_pd({in2}, {in4}, (3 << 4) | 1);
__m256d A = _mm256_unpacklo_pd(A3, B3);
__m256d B = _mm256_unpackhi_pd(A3, B3);
__m256d C = _mm256_unpacklo_pd(C3, D3);
__m256d D = _mm256_unpackhi_pd(C3, D3);
{store}'''.format(store=store4('avx', typ, align, fmtspec, 'A', 'B', 'C', 'D'), **fmtspec)
    if typ in ['i64', 'u64']:
        if simd_ext == 'avx2':
            return \
            '''__m256i A3 = _mm256_permute2f128_si256({in1}, {in3}, 2 << 4);
__m256i B3 = _mm256_permute2f128_si256({in2}, {in4}, 2 << 4);
__m256i C3 = _mm256_permute2f128_si256( {in1}, {in3}, (3 << 4) | 1);
__m256i D3 = _mm256_permute2f128_si256( {in2}, {in4}, (3 << 4) | 1);
__m256i A = _mm256_unpacklo_epi64(A3, B3);
__m256i B = _mm256_unpackhi_epi64(A3, B3);
__m256i C = _mm256_unpacklo_epi64(C3, D3);
__m256i D = _mm256_unpackhi_epi64(C3, D3);
{store}'''.format(store=store4('avx', typ, align, fmtspec, 'A', 'B', 'C', 'D'), **fmtspec)
        else:
            # Plain AVX: bitcast to f64 and reuse the float store4.
            return \
            '''nsimd_store4{a}_avx_f64((f64 *){in0},
_mm256_castsi256_pd({in1}),
_mm256_castsi256_pd({in2}),
_mm256_castsi256_pd({in3}),
_mm256_castsi256_pd({in4}));'''. \
            format(**fmtspec)

###############################################################################

def store4_avx512(simd_ext, typ, align, fmtspec2):
    # Emit the C body of store4 for AVX-512.  8/16-bit types are handled on
    # 256-bit halves (AVX2 intrinsics) and merged back with x86.setr.
    fmtspec = fmtspec2.copy()
    fmtspec['exlo_in1'] = x86.extract(simd_ext, typ, x86.LO, common.in1)
    fmtspec['exhi_in1'] = x86.extract(simd_ext, typ, x86.HI, common.in1)
    fmtspec['exlo_in2'] = x86.extract(simd_ext, typ, x86.LO, common.in2)
    fmtspec['exhi_in2'] = x86.extract(simd_ext, typ, x86.HI, common.in2)
    fmtspec['exlo_in3'] = x86.extract(simd_ext, typ, x86.LO, common.in3)
    fmtspec['exhi_in3'] = x86.extract(simd_ext, typ, x86.HI, common.in3)
    fmtspec['exlo_in4'] = x86.extract(simd_ext, typ, x86.LO, common.in4)
    fmtspec['exhi_in4'] = x86.extract(simd_ext, typ, x86.HI, common.in4)
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        return \
        '''__m256i A0a = {exlo_in1};
__m256i A0b = {exhi_in1};
__m256i B0a = {exlo_in2};
__m256i B0b = {exhi_in2};
__m256i C0a = {exlo_in3};
__m256i C0b = {exhi_in3};
__m256i D0a = {exlo_in4};
__m256i D0b = {exhi_in4};
__m256i A1 = _mm256_unpacklo_epi8(A0a, C0a);
__m256i B1 = _mm256_unpackhi_epi8(A0a, C0a);
__m256i C1 = _mm256_unpacklo_epi8(B0a, D0a);
__m256i D1 = _mm256_unpackhi_epi8(B0a, D0a);
__m256i A2 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));
__m256i B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));
__m256i C2 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(3,1,2,0));
__m256i D2 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(3,1,2,0));
__m256i A3a = _mm256_unpacklo_epi8(A2, C2);
__m256i B3a = _mm256_unpacklo_epi8(B2, D2);
__m256i C3a = _mm256_unpackhi_epi8(A2, C2);
__m256i D3a = _mm256_unpackhi_epi8(B2, D2);
A1 = _mm256_unpacklo_epi8(A0b, C0b);
B1 = _mm256_unpackhi_epi8(A0b, C0b);
C1 = _mm256_unpacklo_epi8(B0b, D0b);
D1 = _mm256_unpackhi_epi8(B0b, D0b);
A2 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));
B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));
C2 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(3,1,2,0));
D2 = _mm256_permute4x64_epi64(D1,
_MM_SHUFFLE(3,1,2,0));
__m256i A3b = _mm256_unpacklo_epi8(A2, C2);
__m256i B3b = _mm256_unpacklo_epi8(B2, D2);
__m256i C3b = _mm256_unpackhi_epi8(A2, C2);
__m256i D3b = _mm256_unpackhi_epi8(B2, D2);
__m512i A = {mergeABa};
__m512i B = {mergeCDa};
__m512i C = {mergeABb};
__m512i D = {mergeCDb};
{store}'''.format(mergeABa=x86.setr(simd_ext, typ, 'A3a', 'B3a'),
                  mergeCDa=x86.setr(simd_ext, typ, 'C3a', 'D3a'),
                  mergeABb=x86.setr(simd_ext, typ, 'A3b', 'B3b'),
                  mergeCDb=x86.setr(simd_ext, typ, 'C3b', 'D3b'),
                  store=store4(simd_ext, typ, align, fmtspec, 'A', 'B', 'C', 'D'),
                  **fmtspec)
    if typ in ['i16', 'u16']:
        # 16-bit lanes: two unpack rounds per 256-bit half, then merge.
        return \
        '''__m256i A0a = {exlo_in1};
__m256i A0b = {exhi_in1};
__m256i B0a = {exlo_in2};
__m256i B0b = {exhi_in2};
__m256i C0a = {exlo_in3};
__m256i C0b = {exhi_in3};
__m256i D0a = {exlo_in4};
__m256i D0b = {exhi_in4};
__m256i A3 = _mm256_unpacklo_epi16(A0a, C0a);
__m256i B3 = _mm256_unpackhi_epi16(A0a, C0a);
__m256i C3 = _mm256_unpacklo_epi16(B0a, D0a);
__m256i D3 = _mm256_unpackhi_epi16(B0a, D0a);
__m256i A4a = _mm256_unpacklo_epi16(A3, C3);
__m256i B4a = _mm256_unpackhi_epi16(A3, C3);
__m256i C4a = _mm256_unpacklo_epi16(B3, D3);
__m256i D4a = _mm256_unpackhi_epi16(B3, D3);
A3 = _mm256_unpacklo_epi16(A0b, C0b);
B3 = _mm256_unpackhi_epi16(A0b, C0b);
C3 = _mm256_unpacklo_epi16(B0b, D0b);
D3 = _mm256_unpackhi_epi16(B0b, D0b);
__m256i A4b = _mm256_unpacklo_epi16(A3, C3);
__m256i B4b = _mm256_unpackhi_epi16(A3, C3);
__m256i C4b = _mm256_unpacklo_epi16(B3, D3);
__m256i D4b = _mm256_unpackhi_epi16(B3, D3);
__m512i A = {mergeABa};
__m512i B = {mergeCDa};
__m512i C = {mergeABb};
__m512i D = {mergeCDb};
{store}'''.format(mergeABa=x86.setr(simd_ext, typ, 'A4a', 'B4a'),
                  mergeCDa=x86.setr(simd_ext, typ, 'C4a', 'D4a'),
                  mergeABb=x86.setr(simd_ext, typ, 'A4b', 'B4b'),
                  mergeCDb=x86.setr(simd_ext, typ, 'C4b', 'D4b'),
                  store=store4(simd_ext, typ, align, fmtspec, 'A', 'B', 'C', 'D'),
                  **fmtspec)
    if typ in ['f32', 'i32', 'u32']:
        # 32-bit lanes: two rounds of full-width two-source permutes.
        return \
        '''__m512i m1 = _mm512_setr_epi32(0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23);
__m512i m2 = _mm512_setr_epi32(8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31);
__m512i m3 = _mm512_setr_epi32(0, 4, 16, 20, 1, 5, 17, 21, 2, 6, 18, 22, 3, 7, 19, 23);
__m512i m4 = _mm512_setr_epi32(8, 12, 24, 28, 9, 13, 25, 29, 10, 14, 26, 30, 11, 15, 27, 31);
{styp} WXa = _mm512_permutex2var{suf}({in1}, m1, {in2});
{styp} WXb = _mm512_permutex2var{suf}({in1}, m2, {in2});
{styp} YZa = _mm512_permutex2var{suf}({in3}, m1, {in4});
{styp} YZb = _mm512_permutex2var{suf}({in3}, m2, {in4});
{styp} A = _mm512_permutex2var{suf}(WXa, m3, YZa);
{styp} B = _mm512_permutex2var{suf}(WXa, m4, YZa);
{styp} C = _mm512_permutex2var{suf}(WXb, m3, YZb);
{styp} D = _mm512_permutex2var{suf}(WXb, m4, YZb);
{store}'''.format(store=store4(simd_ext, typ, align, fmtspec, 'A', 'B', 'C', 'D'), **fmtspec)
    if typ in ['f64', 'i64', 'u64']:
        return \
        '''__m512i A_mask = _mm512_setr_epi64(0, 1, 2, 3, 8, 9, 10, 11);
__m512i B_mask = _mm512_setr_epi64(4, 5, 6, 7, 12, 13, 14, 15);
__m512i C_mask = _mm512_setr_epi64(0, 4, 8, 12, 1, 5, 9, 13);
__m512i D_mask = _mm512_setr_epi64(2, 6, 10, 14, 3, 7, 11, 15);
{styp} A1 = _mm512_permutex2var{suf}({in1}, A_mask, {in2});
{styp} B1 = _mm512_permutex2var{suf}({in1}, B_mask, {in2});
{styp} C1 = _mm512_permutex2var{suf}({in3}, A_mask, {in4});
{styp} D1 = _mm512_permutex2var{suf}({in3}, B_mask, {in4});
{styp} A = _mm512_permutex2var{suf}(A1, C_mask, C1);
{styp} B = _mm512_permutex2var{suf}(A1, D_mask, C1);
{styp} C = _mm512_permutex2var{suf}(B1, C_mask, D1);
{styp} D = _mm512_permutex2var{suf}(B1, D_mask, D1);
{store}'''.format(store=store4(simd_ext, typ, align, fmtspec, 'A', 'B', 'C', 'D'), **fmtspec)

###############################################################################

def get_load_v0v1v2(simd_ext, typ, align, fmtspec):
    # Emit the three consecutive vector loads used by the load3 (load of 3
    # interleaved vectors) implementations; v0/v1/v2 are left interleaved.
    # NOTE(review): the source pointer is hard-coded as "a0" here (vs {in0}
    # elsewhere) — presumably both always name the first argument; verify
    # against the generated prototypes.
    load = '{pre}load{a}{sufsi}'.format(a='' if align else 'u', **fmtspec)
    if typ in ['f32', 'f64']:
        return '''{styp} v0 = {load}(a0);
{styp} v1 = {load}(a0 + {le});
{styp} v2 = {load}(a0 + (2 * {le}));'''. \
        format(load=load, **fmtspec)
    else:
        return '''{styp} v0 = {load}(({styp}*)a0);
{styp} v1 = {load}(({styp}*)a0 + 1);
{styp} v2 = {load}(({styp}*)a0 + 2);'''. \
        format(load=load, **fmtspec)

###############################################################################

def load3_sse(simd_ext, typ, align, fmtspec2):
    # Emit the C body of load3 (AoS -> SoA deinterleave by 3) for SSE.
    fmtspec = fmtspec2.copy()
    fmtspec['load_v0v1v2'] = get_load_v0v1v2('sse', typ, align, fmtspec)
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        if simd_ext == 'sse42':
            # SSE4.2 (implies SSSE3): gather every 3rd byte from each of the
            # three input vectors with pshufb and OR the partial results.
            return \
            '''nsimd_sse42_v{typ}x3 ret;
{load_v0v1v2}
__m128i A1_mask = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0);
__m128i A2_mask = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1);
__m128i A3_mask = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m128i A4 = _mm_shuffle_epi8(v0, A1_mask);
__m128i A5 = _mm_shuffle_epi8(v1, A2_mask);
__m128i A6 = _mm_shuffle_epi8(v2, A3_mask);
A4 = _mm_or_si128(A4, A5);
ret.v0 = _mm_or_si128(A4, A6);
__m128i B1_mask = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1);
__m128i B2_mask = _mm_set_epi8(-1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1);
__m128i B3_mask = _mm_set_epi8(14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m128i B4 = _mm_shuffle_epi8(v0, B1_mask);
__m128i B5 = _mm_shuffle_epi8(v1, B2_mask);
__m128i B6 = _mm_shuffle_epi8(v2, B3_mask);
B4 = _mm_or_si128(B4, B5);
ret.v1 = _mm_or_si128(B4, B6);
__m128i C1_mask = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 14, 11, 8, 5, 2);
__m128i C2_mask = _mm_set_epi8(-1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1, -1, -1, -1, -1, -1);
__m128i C3_mask = _mm_set_epi8(15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m128i C4 = _mm_shuffle_epi8(v0, C1_mask);
__m128i C5 = _mm_shuffle_epi8(v1, C2_mask);
__m128i C6 = _mm_shuffle_epi8(v2, C3_mask);
C4 = _mm_or_si128(C4, C5);
ret.v2 = _mm_or_si128(C4, C6);
return ret;'''.format(**fmtspec)
        else:
            # SSE2 (no pshufb): iterate 4 rounds of 64-bit rotations + byte
            # unpacks that converge to the deinterleaved layout.
            return \
            '''nsimd_sse2_v{typ}x3 ret;
{load_v0v1v2}
__m128i A0 = v0;
__m128i B0 = v1;
__m128i C0 = v2;
int k;
for (k = 0; k < 4; ++k) {{
  __m128d B0_pd = _mm_castsi128_pd(B0);
  __m128d C0_pd = _mm_castsi128_pd(C0);
  __m128d B1_pd = _mm_shuffle_pd(B0_pd, B0_pd, 1);
  __m128d C2_pd = _mm_shuffle_pd(C0_pd, C0_pd, 1);
  __m128i B1 = _mm_castpd_si128(B1_pd);
  __m128i C2 = _mm_castpd_si128(C2_pd);
  __m128i B3 = _mm_unpackhi_epi8(A0, C2);
  __m128i A4 = _mm_unpacklo_epi8(A0, B1);
  __m128i C5 = _mm_unpackhi_epi8(B1, C0);
  A0 = A4;
  B0 = B3;
  C0 = C5;
}}
ret.v0 = A0;
ret.v1 = B0;
ret.v2 = C0;
return ret;'''.format(**fmtspec)
    if typ in ['i16', 'u16']:
        # Same convergence trick as the SSE2 i8/u8 path, 3 rounds for words.
        return \
        '''nsimd_{simd_ext}_v{typ}x3 ret;
{load_v0v1v2}
int k;
for (k = 0; k < 3; ++k) {{
  __m128d B1_pd = _mm_castsi128_pd(v1);
  __m128d C1_pd = _mm_castsi128_pd(v2);
  __m128d B2_pd = _mm_shuffle_pd(B1_pd, B1_pd, 1);
  __m128d C3_pd = _mm_shuffle_pd(C1_pd, C1_pd, 1);
  __m128i B2 = _mm_castpd_si128(B2_pd);
  __m128i C3 = _mm_castpd_si128(C3_pd);
  __m128i B4 = _mm_unpackhi_epi16(v0, C3);
  __m128i A5 = _mm_unpacklo_epi16(v0, B2);
  __m128i C7 = _mm_unpackhi_epi16(B2, v2);
  v0 = A5;
  v1 = B4;
  v2 = C7;
}}
ret.v0 = v0;
ret.v1 = v1;
ret.v2 = v2;
return ret;'''.format(**fmtspec)
    if typ == 'f32':
        return \
        '''nsimd_{simd_ext}_v{typ}x3 ret;
{load_v0v1v2}
__m128 A1 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3,2,1,0));
__m128 B2 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,2,1,0));
__m128 C3 = _mm_shuffle_ps(v1, v0, _MM_SHUFFLE(3,2,1,0));
ret.v0 = _mm_shuffle_ps(v0, A1, _MM_SHUFFLE(1,2,3,0));
__m128 B5 = _mm_shuffle_ps(B2, v1, _MM_SHUFFLE(0,3,2,1));
ret.v2 = _mm_shuffle_ps(C3, v2, _MM_SHUFFLE(3,0,1,2));
ret.v1 = _mm_shuffle_ps(B5, B5, _MM_SHUFFLE(1,2,3,0));
return ret;'''.format(**fmtspec)
    if typ in ['i32', 'u32']:
        # Bitcast through the f32 implementation.
        return \
        '''nsimd_{simd_ext}_v{typ}x3 ret;
nsimd_{simd_ext}_vf32x3 retf32 = nsimd_load3{a}_{simd_ext}_f32((f32 *){in0});
ret.v0 = _mm_castps_si128(retf32.v0);
ret.v1 = _mm_castps_si128(retf32.v1);
ret.v2 = _mm_castps_si128(retf32.v2);
return ret;'''.format(**fmtspec)
    if typ == 'f64':
        return \
        '''nsimd_{simd_ext}_vf64x3 ret;
{load_v0v1v2}
ret.v0 = _mm_shuffle_pd(v0, v1, 2);
ret.v1 = _mm_shuffle_pd(v0, v2, 1);
ret.v2 = _mm_shuffle_pd(v1, v2, 2);
return ret;'''.format(**fmtspec)
    if typ in ['i64', 'u64']:
        # Bitcast through the f64 implementation.
        return \
        '''nsimd_{simd_ext}_v{typ}x3 ret;
nsimd_{simd_ext}_vf64x3 retf64 = nsimd_load3{a}_{simd_ext}_f64((f64 *){in0});
ret.v0 = _mm_castpd_si128(retf64.v0);
ret.v1 = _mm_castpd_si128(retf64.v1);
ret.v2 = _mm_castpd_si128(retf64.v2);
return ret;'''.format(**fmtspec)

###############################################################################

def store3(simd_ext, typ, align, fmtspec2, v0, v1, v2):
    # Emit the final three consecutive vector stores of a store3; v0..v2 are
    # the C variable names holding the already-interleaved vectors.
    fmtspec = fmtspec2.copy()
    fmtspec['a'] = '' if align else 'u'
    store = '{pre}store{a}{sufsi}'.format(**fmtspec)
    fmtspec['store'] = store
    fmtspec['v0'] = v0
    fmtspec['v1'] = v1
    fmtspec['v2'] = v2
    if typ in ['f32', 'f64']:
        return \
        '''{store}({in0}, {v0});
{store}({in0} + {le}, {v1});
{store}({in0} + (2 * {le}), {v2});'''.format(**fmtspec)
    else:
        return \
        '''{store}(({styp} *){in0}, {v0});
{store}(({styp} *){in0} + 1, {v1});
{store}(({styp} *){in0} + 2, {v2});'''.format(**fmtspec)

###############################################################################

def store3_sse(simd_ext, typ, align, fmtspec2):
    # Emit the C body of store3 (SoA -> AoS interleave by 3 then store)
    # for the SSE extensions.
    fmtspec = fmtspec2.copy()
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        if simd_ext == 'sse42':
            # SSE4.2: scatter each source's bytes to their interleaved slots
            # with pshufb and OR the three partial vectors together.
            return \
            '''__m128i A1_mask = _mm_set_epi8( 5, -1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0);
__m128i A2_mask = _mm_set_epi8(-1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1);
__m128i A3_mask = _mm_set_epi8(-1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1, -1);
__m128i A4 = _mm_shuffle_epi8({in1}, A1_mask);
__m128i A5 = _mm_shuffle_epi8({in2}, A2_mask);
__m128i A6 = _mm_shuffle_epi8({in3}, A3_mask);
A4 = _mm_or_si128(A4, A5);
A4 = _mm_or_si128(A4, A6);
__m128i B1_mask = _mm_set_epi8(-1, 10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1);
__m128i B2_mask =
_mm_set_epi8(10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5); __m128i B3_mask = _mm_set_epi8(-1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5, -1); __m128i B4 = _mm_shuffle_epi8({in1}, B1_mask); __m128i B5 = _mm_shuffle_epi8({in2}, B2_mask); __m128i B6 = _mm_shuffle_epi8({in3}, B3_mask); B4 = _mm_or_si128(B4, B5); B4 = _mm_or_si128(B4, B6); __m128i C1_mask = _mm_set_epi8(-1, -1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1); __m128i C2_mask = _mm_set_epi8(-1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1); __m128i C3_mask = _mm_set_epi8(15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1, 10); __m128i C4 = _mm_shuffle_epi8({in1}, C1_mask); __m128i C5 = _mm_shuffle_epi8({in2}, C2_mask); __m128i C6 = _mm_shuffle_epi8({in3}, C3_mask); C4 = _mm_or_si128(C4, C5); C4 = _mm_or_si128(C4, C6); {store4}'''.format(store4=store3('sse', typ, align, fmtspec, 'A4', 'B4', 'C4'), **fmtspec) else: return \ '''__m128i A0 = {in1}; __m128i B0 = {in2}; __m128i C0 = {in3}; int k; for (k = 0; k < 4; ++k) {{ __m128i A1 = _mm_unpacklo_epi8(A0, B0); __m128i A2 = _mm_unpackhi_epi8(A0, B0); __m128i A3 = _mm_unpacklo_epi8(A1, A2); __m128i A4 = _mm_unpackhi_epi8(A1, A2); __m128i A5 = _mm_unpacklo_epi8(A3, A4); __m128i A6 = _mm_unpackhi_epi8(A3, A4); __m128i A7 = _mm_unpacklo_epi8(A5, A6); __m128i B8 = _mm_unpackhi_epi8(A5, A6); __m128i C9 = _mm_castpd_si128(_mm_shuffle_pd( _mm_castsi128_pd(C0), _mm_castsi128_pd(C0), 1)); __m128i C10 = _mm_unpacklo_epi8(C0, C9); __m128i C11 = _mm_castpd_si128(_mm_shuffle_pd( _mm_castsi128_pd(C10), _mm_castsi128_pd(C10), 1)); __m128i C12 = _mm_unpacklo_epi8(C10, C11); __m128i C13 = _mm_castpd_si128(_mm_shuffle_pd( _mm_castsi128_pd(C12), _mm_castsi128_pd(C12), 1)); __m128i C14 = _mm_unpacklo_epi8(C12, C13); __m128i B15 = _mm_castpd_si128(_mm_shuffle_pd( _mm_castsi128_pd(C14), _mm_castsi128_pd(B8), 0)); __m128i C16 = _mm_castpd_si128(_mm_shuffle_pd( _mm_castsi128_pd(B8), _mm_castsi128_pd(C14), 3)); A0 = A7; B0 = B15; C0 
= C16; }} {store0}'''.format(store0=store3('sse', typ, align, fmtspec, 'A0', 'B0', 'C0'), **fmtspec) if typ in ['i16', 'u16']: if simd_ext == 'avx2': return \ '''__m128i A0 = {in1}; __m128i B0 = {in2}; __m128i C0 = {in3}; __m128i A1_mask = _mm_set_epi8(-1, -1, 5, 4, -1, -1, -1, -1, 3, 2, -1, -1, -1, -1, 1, 0); __m128i A2_mask = _mm_set_epi8( 5, 4, -1, -1, -1, -1, 3, 2, -1, -1, -1, -1, 1, 0, -1, -1); __m128i A3_mask = _mm_set_epi8(-1, -1, -1, -1, 3, 2, -1, -1, -1, -1, 1, 0, -1, -1, -1, -1); __m128i A4 = _mm_shuffle_epi8(A0, A1_mask); __m128i A5 = _mm_shuffle_epi8(B0, A2_mask); __m128i A6 = _mm_shuffle_epi8(C0, A3_mask); A4 = _mm_or_si128(A4, A5); A4 = _mm_or_si128(A4, A6); __m128i B1_mask = _mm_set_epi8(11, 10, -1, -1, -1, -1, 9, 8, -1, -1, -1, -1, 7, 6, -1, -1); __m128i B2_mask = _mm_set_epi8(-1, -1, -1, -1, 9, 8, -1, -1, -1, -1, 7, 6, -1, -1, -1, -1); __m128i B3_mask = _mm_set_epi8(-1, -1, 9, 8, -1, -1, -1, -1, 7, 6, -1, -1, -1, -1, 5, 4); __m128i B4 = _mm_shuffle_epi8(A0, B1_mask); __m128i B5 = _mm_shuffle_epi8(B0, B2_mask); __m128i B6 = _mm_shuffle_epi8(C0, B3_mask); B4 = _mm_or_si128(B4, B5); B4 = _mm_or_si128(B4, B6); __m128i C1_mask = _mm_set_epi8(-1, -1, -1, -1, 15, 14, -1, -1, -1, -1, 13, 12, -1, -1, -1, -1); __m128i C2_mask = _mm_set_epi8(-1, -1, 15, 14, -1, -1, -1, -1, 13, 12, -1, -1, -1, -1, 11, 10); __m128i C3_mask = _mm_set_epi8(15, 14, -1, -1, -1, -1, 13, 12, -1, -1, -1, -1, 11, 10, -1, -1); __m128i C4 = _mm_shuffle_epi8(A0, C1_mask); __m128i C5 = _mm_shuffle_epi8(B0, C2_mask); __m128i C6 = _mm_shuffle_epi8(C0, C3_mask); C4 = _mm_or_si128(C4, C5); C4 = _mm_or_si128(C4, C6); {store4};'''.format(store4=store3('sse', typ, align, fmtspec, 'A4', 'B4', 'C4'), **fmtspec) else: return \ '''__m128i A0 = {in1}; __m128i B0 = {in2}; __m128i C0 = {in3}; int k; for (k = 0; k < 3; ++k) {{ __m128i A1 = _mm_shufflelo_epi16(A0, _MM_SHUFFLE(3, 1, 2, 0)); __m128i A2 = _mm_shufflehi_epi16(A1, _MM_SHUFFLE(3, 1, 2, 0)); __m128i B3 = _mm_shufflelo_epi16(B0, _MM_SHUFFLE(3, 1, 
2, 0)); __m128i B4 = _mm_shufflehi_epi16(B3, _MM_SHUFFLE(3, 1, 2, 0)); __m128i C5 = _mm_shufflelo_epi16(C0, _MM_SHUFFLE(3, 1, 2, 0)); __m128i C6 = _mm_shufflehi_epi16(C5, _MM_SHUFFLE(3, 1, 2, 0)); __m128 A2_ps = _mm_castsi128_ps(A2); __m128 B4_ps = _mm_castsi128_ps(B4); __m128 C6_ps = _mm_castsi128_ps(C6); __m128 A0_ps = _mm_shuffle_ps(A2_ps, B4_ps, _MM_SHUFFLE(2, 0, 2, 0)); __m128 B0_ps = _mm_shuffle_ps(C6_ps, A2_ps, _MM_SHUFFLE(3, 1, 2, 0)); __m128 C0_ps = _mm_shuffle_ps(B4_ps, C6_ps, _MM_SHUFFLE(3, 1, 3, 1)); A0 = _mm_castps_si128(A0_ps); B0 = _mm_castps_si128(B0_ps); C0 = _mm_castps_si128(C0_ps); }} {store0}'''.format(store0=store3('sse', typ, align, fmtspec, 'A0', 'B0', 'C0'), **fmtspec) if typ == 'f32': return \ '''__m128 A1 = _mm_shuffle_ps({in1}, {in2}, _MM_SHUFFLE(2,0,2,0)); __m128 B2 = _mm_shuffle_ps({in3}, {in1}, _MM_SHUFFLE(3,1,2,0)); __m128 C3 = _mm_shuffle_ps({in2}, {in3}, _MM_SHUFFLE(3,1,3,1)); __m128 A4 = _mm_shuffle_ps(A1, B2, _MM_SHUFFLE(2,0,2,0)); __m128 B5 = _mm_shuffle_ps(C3, A1, _MM_SHUFFLE(3,1,2,0)); __m128 C6 = _mm_shuffle_ps(B2, C3, _MM_SHUFFLE(3,1,3,1)); {store};'''. \ format(store=store3('sse', typ, align, fmtspec, 'A4', 'B5', 'C6'), **fmtspec) if typ in ['i32', 'u32']: return \ '''nsimd_store3{a}_{simd_ext}_f32((f32 *){in0}, _mm_castsi128_ps({in1}), _mm_castsi128_ps({in2}), _mm_castsi128_ps({in3}));'''. \ format(**fmtspec) if typ == 'f64': return \ '''__m128d A0 = _mm_unpacklo_pd({in1}, {in2}); __m128d B0 = _mm_shuffle_pd({in3}, {in1}, 2); __m128d C0 = _mm_unpackhi_pd({in2}, {in3}); {store}'''. \ format(store=store3('sse', typ, align, fmtspec, 'A0', 'B0', 'C0'), **fmtspec) if typ in ['i64', 'u64']: return \ '''nsimd_store3{a}_{simd_ext}_f64((f64 *){in0}, _mm_castsi128_pd({in1}), _mm_castsi128_pd({in2}), _mm_castsi128_pd({in3}));'''. 
        format(**fmtspec)

###############################################################################

def load3_avx(simd_ext, typ, align, fmtspec2):
    # Emit the C body of the AVX/AVX2 3-way deinterleaving load
    # (nsimd_load3{a}): memory at {in0} holds [x0 y0 z0 x1 y1 z1 ...] and the
    # generated code returns ret.v0 = all x's, ret.v1 = all y's,
    # ret.v2 = all z's.  `simd_ext` is 'avx' or 'avx2'; `align` selects the
    # aligned ('a') vs unaligned ('u') load intrinsics.
    fmtspec = fmtspec2.copy()
    # C snippet that loads the three raw (still interleaved) 256-bit
    # registers v0, v1, v2 from memory.
    fmtspec['load_v0v1v2'] = get_load_v0v1v2('avx', typ, align, fmtspec)
    # 128-bit half extraction snippets: plain AVX has no 256-bit integer byte
    # shuffle, so the 8/16-bit fallback paths work on the six 128-bit halves
    # with SSSE3 shuffles and re-merge the halves at the end (x86.setr).
    fmtspec['exlo_v0'] = x86.extract('avx', typ, x86.LO, 'v0')
    fmtspec['exhi_v0'] = x86.extract('avx', typ, x86.HI, 'v0')
    fmtspec['exlo_v1'] = x86.extract('avx', typ, x86.LO, 'v1')
    fmtspec['exhi_v1'] = x86.extract('avx', typ, x86.HI, 'v1')
    fmtspec['exlo_v2'] = x86.extract('avx', typ, x86.LO, 'v2')
    fmtspec['exhi_v2'] = x86.extract('avx', typ, x86.HI, 'v2')
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        if simd_ext == 'avx2':
            # AVX2: a -1 index in a shuffle mask zeroes the destination byte,
            # so the three partial per-source shuffles can simply be OR-ed
            # together; _mm256_shuffle_epi8 only shuffles within each 128-bit
            # lane, hence the _mm256_permute2f128_si256 lane fix-ups.
            return \
            '''nsimd_avx2_v{typ}x3 ret;
               {load_v0v1v2}
               __m256i ARmask = _mm256_setr_epi8( 0, 3, 6, 9, 12, 15, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1,
                                 -1);
               __m256i BRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, 1, 4, 7, 10, 13, 0, 3, 6, 9,
                                 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1);
               __m256i CRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 5,
                                 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10,
                                 13);
               __m256i AR = _mm256_shuffle_epi8(v0, ARmask);
               __m256i BR = _mm256_shuffle_epi8(v1, BRmask);
               __m256i CR = _mm256_shuffle_epi8(v2, CRmask);
               __m256i DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1);
               __m256i R0 = _mm256_or_si256(AR, BR);
               __m256i R1 = _mm256_or_si256(BR, CR);
               __m256i R2 = _mm256_permute2f128_si256(R0, R1, 3 << 4);
               ret.v0 = _mm256_or_si256(DR, R2);
               __m256i AGmask = _mm256_setr_epi8( 1, 4, 7, 10, 13, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1,
                                 -1);
               __m256i BGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, 2, 5, 8, 11, 14, 1, 4, 7, 10,
                                 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1);
               __m256i CGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, 0, 3, 6,
                                 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11,
                                 14);
               __m256i AG = _mm256_shuffle_epi8(v0, AGmask);
               __m256i BG = _mm256_shuffle_epi8(v1, BGmask);
               __m256i CG = _mm256_shuffle_epi8(v2, CGmask);
               __m256i DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1);
               __m256i G0 = _mm256_or_si256(AG, BG);
               __m256i G1 = _mm256_or_si256(BG, CG);
               __m256i G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4);
               ret.v1 = _mm256_or_si256(DG, G2);
               __m256i ABmask = _mm256_setr_epi8( 2, 5, 8, 11, 14, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1,
                                 -1);
               __m256i BBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11,
                                 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1);
               __m256i CBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, 1, 4, 7,
                                 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12,
                                 15);
               __m256i AB = _mm256_shuffle_epi8(v0, ABmask);
               __m256i BB = _mm256_shuffle_epi8(v1, BBmask);
               __m256i CB = _mm256_shuffle_epi8(v2, CBmask);
               __m256i DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1);
               __m256i B0 = _mm256_or_si256(AB, BB);
               __m256i B1 = _mm256_or_si256(BB, CB);
               __m256i B2 = _mm256_permute2f128_si256(B0, B1, 3 << 4);
               ret.v2 = _mm256_or_si256(DB, B2);
               return ret;'''.format(**fmtspec)
        else:
            # Plain AVX fallback: deinterleave each group of three 16-byte
            # chunks (Aa/Ba/Ca = low half, Ab/Bb/Cb = high half) with SSSE3
            # byte shuffles, then merge the two 128-bit results ({mergeR} etc.
            # expand to x86.setr, i.e. _mm256_set_m128i-style code).
            return \
            '''nsimd_avx_v{typ}x3 ret;
               {load_v0v1v2}
               __m128i Aa = {exlo_v0};
               __m128i Ba = {exhi_v0};
               __m128i Ca = {exlo_v1};
               __m128i Ab = {exhi_v1};
               __m128i Bb = {exlo_v2};
               __m128i Cb = {exhi_v2};
               __m128i ARm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
                                          -1, 15, 12, 9, 6, 3, 0);
               __m128i BRm = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5, 2,
                                          -1, -1, -1, -1, -1, -1);
               __m128i CRm = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1, -1,
                                          -1, -1, -1, -1, -1, -1);
               __m128i AR = _mm_shuffle_epi8(Aa, ARm);
               __m128i BR = _mm_shuffle_epi8(Ba, BRm);
               __m128i CR = _mm_shuffle_epi8(Ca, CRm);
               __m128i R0 = _mm_or_si128(AR, BR);
               R0 = _mm_or_si128(R0, CR);
               AR = _mm_shuffle_epi8(Ab, ARm);
               BR = _mm_shuffle_epi8(Bb, BRm);
               CR = _mm_shuffle_epi8(Cb, CRm);
               __m128i R1 = _mm_or_si128(AR, BR);
               R1 = _mm_or_si128(R1, CR);
               __m128i AGm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
                                          -1, -1, 13, 10, 7, 4, 1);
               __m128i BGm = _mm_set_epi8(-1, -1, -1, -1, -1, 15, 12, 9, 6, 3,
                                          0, -1, -1, -1, -1, -1);
               __m128i CGm = _mm_set_epi8(14, 11, 8, 5, 2, -1, -1, -1, -1, -1,
                                          -1, -1, -1, -1, -1, -1);
               __m128i AG = _mm_shuffle_epi8(Aa, AGm);
               __m128i BG = _mm_shuffle_epi8(Ba, BGm);
               __m128i CG = _mm_shuffle_epi8(Ca, CGm);
               __m128i G0 = _mm_or_si128(AG, BG);
               G0 = _mm_or_si128(G0, CG);
               AG = _mm_shuffle_epi8(Ab, AGm);
               BG = _mm_shuffle_epi8(Bb, BGm);
               CG = _mm_shuffle_epi8(Cb, CGm);
               __m128i G1 = _mm_or_si128(AG, BG);
               G1 = _mm_or_si128(G1, CG);
               __m128i ABm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
                                          -1, -1, 14, 11, 8, 5, 2);
               __m128i BBm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, 13, 10, 7,
                                          4, 1, -1, -1, -1, -1, -1);
               __m128i CBm = _mm_set_epi8(15, 12, 9, 6, 3, 0, -1, -1, -1, -1,
                                          -1, -1, -1, -1, -1, -1);
               __m128i AB = _mm_shuffle_epi8(Aa, ABm);
               __m128i BB = _mm_shuffle_epi8(Ba, BBm);
               __m128i CB = _mm_shuffle_epi8(Ca, CBm);
               __m128i B0 = _mm_or_si128(AB, BB);
               B0 = _mm_or_si128(B0, CB);
               AB = _mm_shuffle_epi8(Ab, ABm);
               BB = _mm_shuffle_epi8(Bb, BBm);
               CB = _mm_shuffle_epi8(Cb, CBm);
               __m128i B1 = _mm_or_si128(AB, BB);
               B1 = _mm_or_si128(B1, CB);
               ret.v0 = {mergeR};
               ret.v1 = {mergeG};
               ret.v2 = {mergeB};
               return ret;'''.format(mergeR=x86.setr('avx', typ, 'R0', 'R1'),
                                     mergeG=x86.setr('avx', typ, 'G0', 'G1'),
                                     mergeB=x86.setr('avx', typ, 'B0', 'B1'),
                                     **fmtspec)
    if typ in ['i16', 'u16']:
        if simd_ext == 'avx2':
            # Same OR-of-shuffles scheme as the 8-bit AVX2 path, with byte
            # pairs (16-bit element) indices in the masks.
            return \
            '''nsimd_avx2_v{typ}x3 ret;
               {load_v0v1v2}
               __m256i ARmask = _mm256_setr_epi8( 0, 1, 6, 7, 12, 13, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, 2, 3, 8, 9, 14, 15, -1, -1, -1,
                                 -1);
               __m256i BRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, 4, 5, 10, 11, 0, 1, 6, 7,
                                 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1);
               __m256i CRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 3,
                                 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 10,
                                 11);
               __m256i AR = _mm256_shuffle_epi8(v0, ARmask);
               __m256i BR = _mm256_shuffle_epi8(v1, BRmask);
               __m256i CR = _mm256_shuffle_epi8(v2, CRmask);
               __m256i DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1);
               __m256i R0 = _mm256_or_si256(AR, BR);
               __m256i R1 = _mm256_or_si256(BR, CR);
               __m256i R2 = _mm256_permute2f128_si256(R0, R1, 3 << 4);
               ret.v0 = _mm256_or_si256(DR, R2);
               __m256i AGmask = _mm256_setr_epi8( 2, 3, 8, 9, 14, 15, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, 4, 5, 10, 11, -1, -1, -1, -1, -1,
                                 -1);
               __m256i BGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9,
                                 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1);
               __m256i CGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, 4, 5,
                                 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, 0, 1, 6, 7, 12,
                                 13);
               __m256i AG = _mm256_shuffle_epi8(v0, AGmask);
               __m256i BG = _mm256_shuffle_epi8(v1, BGmask);
               __m256i CG = _mm256_shuffle_epi8(v2, CGmask);
               __m256i DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1);
               __m256i G0 = _mm256_or_si256(AG, BG);
               __m256i G1 = _mm256_or_si256(BG, CG);
               __m256i G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4);
               ret.v1 = _mm256_or_si256(DG, G2);
               __m256i ABmask = _mm256_setr_epi8( 4, 5, 10, 11, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, 0, 1, 6, 7, 12, 13, -1, -1, -1, -1,
                                 -1, -1);
               __m256i BBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1);
               __m256i CBmask = _mm256_setr_epi8(-1, -1, -1, -1, 0, 1, 6, 7,
                                 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, 2, 3, 8, 9, 14,
                                 15);
               __m256i AB = _mm256_shuffle_epi8(v0, ABmask);
               __m256i BB = _mm256_shuffle_epi8(v1, BBmask);
               __m256i CB = _mm256_shuffle_epi8(v2, CBmask);
               __m256i DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1);
               __m256i B0 = _mm256_or_si256(AB, BB);
               __m256i B1 = _mm256_or_si256(BB, CB);
               __m256i B2 = _mm256_permute2f128_si256(B0, B1, 3 << 4);
               ret.v2 = _mm256_or_si256(DB, B2);
               return ret;'''.format(**fmtspec)
        else:
            # Plain AVX fallback: 128-bit halves + SSSE3 shuffles, as in the
            # 8-bit case but with 16-bit (byte-pair) mask indices.
            return \
            '''nsimd_avx_v{typ}x3 ret;
               {load_v0v1v2}
               __m128i Aa = {exlo_v0};
               __m128i Ba = {exhi_v0};
               __m128i Ca = {exlo_v1};
               __m128i Ab = {exhi_v1};
               __m128i Bb = {exlo_v2};
               __m128i Cb = {exhi_v2};
               __m128i ARm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
                                          -1, 13, 12, 7, 6, 1, 0);
               __m128i BRm = _mm_set_epi8(-1, -1, -1, -1, 15, 14, 9, 8, 3, 2,
                                          -1, -1, -1, -1, -1, -1);
               __m128i CRm = _mm_set_epi8(11, 10, 5, 4, -1, -1, -1, -1, -1,
                                          -1, -1, -1, -1, -1, -1, -1);
               __m128i AR = _mm_shuffle_epi8(Aa, ARm);
               __m128i BR = _mm_shuffle_epi8(Ba, BRm);
               __m128i CR = _mm_shuffle_epi8(Ca, CRm);
               __m128i R0 = _mm_or_si128(AR, BR);
               R0 = _mm_or_si128(R0, CR);
               AR = _mm_shuffle_epi8(Ab, ARm);
               BR = _mm_shuffle_epi8(Bb, BRm);
               CR = _mm_shuffle_epi8(Cb, CRm);
               __m128i R1 = _mm_or_si128(AR, BR);
               R1 = _mm_or_si128(R1, CR);
               __m128i AGm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
                                          -1, 15, 14, 9, 8, 3, 2);
               __m128i BGm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, 11, 10, 5,
                                          4, -1, -1, -1, -1, -1, -1);
               __m128i CGm = _mm_set_epi8(13, 12, 7, 6, 1, 0, -1, -1, -1, -1,
                                          -1, -1, -1, -1, -1, -1);
               __m128i AG = _mm_shuffle_epi8(Aa, AGm);
               __m128i BG = _mm_shuffle_epi8(Ba, BGm);
               __m128i CG = _mm_shuffle_epi8(Ca, CGm);
               __m128i G0 = _mm_or_si128(AG, BG);
               G0 = _mm_or_si128(G0, CG);
               AG = _mm_shuffle_epi8(Ab, AGm);
               BG = _mm_shuffle_epi8(Bb, BGm);
               CG = _mm_shuffle_epi8(Cb, CGm);
               __m128i G1 = _mm_or_si128(AG, BG);
               G1 = _mm_or_si128(G1, CG);
               __m128i ABm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
                                          -1, -1, -1, 11, 10, 5, 4);
               __m128i BBm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, 13, 12, 7,
                                          6, 1, 0, -1, -1, -1, -1);
               __m128i CBm = _mm_set_epi8(15, 14, 9, 8, 3, 2, -1, -1, -1, -1,
                                          -1, -1, -1, -1, -1, -1);
               __m128i AB = _mm_shuffle_epi8(Aa, ABm);
               __m128i BB = _mm_shuffle_epi8(Ba, BBm);
               __m128i CB = _mm_shuffle_epi8(Ca, CBm);
               __m128i B0 = _mm_or_si128(AB, BB);
               B0 = _mm_or_si128(B0, CB);
               AB = _mm_shuffle_epi8(Ab, ABm);
               BB = _mm_shuffle_epi8(Bb, BBm);
               CB = _mm_shuffle_epi8(Cb, CBm);
               __m128i B1 = _mm_or_si128(AB, BB);
               B1 = _mm_or_si128(B1, CB);
               ret.v0 = {mergeR};
               ret.v1 = {mergeG};
               ret.v2 = {mergeB};
               return ret;'''.format(mergeR=x86.setr('avx', typ, 'R0', 'R1'),
                                     mergeG=x86.setr('avx', typ, 'G0', 'G1'),
                                     mergeB=x86.setr('avx', typ, 'B0', 'B1'),
                                     **fmtspec)
    # 32-bit elements on AVX2: cross-lane dword permutes
    # (_mm256_permutevar8x32) followed by blends.  {styp}/{suf} come from
    # fmtspec so the same template serves f32 and i32/u32.
    avx2_template = \
    '''nsimd_avx2_v{typ}x3 ret;
       {load_v0v1v2}
       __m256i RAm = _mm256_setr_epi32( 0, 3, 6, -1, -1, -1, -1, -1);
       __m256i RBm = _mm256_setr_epi32(-1, -1, -1, 1, 4, 7, -1, -1);
       __m256i RCm = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 2, 5);
       __m256i GAm = _mm256_setr_epi32( 1, 4, 7, -1, -1, -1, -1, -1);
       __m256i GBm = _mm256_setr_epi32(-1, -1, -1, 2, 5, -1, -1, -1);
       __m256i GCm = _mm256_setr_epi32(-1, -1, -1, -1, -1, 0, 3, 6);
       __m256i BAm = _mm256_setr_epi32( 2, 5, -1, -1, -1, -1, -1, -1);
       __m256i BBm = _mm256_setr_epi32(-1, -1, 0, 3, 6, -1, -1, -1);
       __m256i BCm = _mm256_setr_epi32(-1, -1, -1, -1, -1, 1, 4, 7);
       {styp} RA = _mm256_permutevar8x32{suf}(v0, RAm);
       {styp} RB = _mm256_permutevar8x32{suf}(v1, RBm);
       {styp} RC = _mm256_permutevar8x32{suf}(v2, RCm);
       {styp} R = _mm256_blend{suf}(RA, RB, 8 + 16 + 32);
       ret.v0 = _mm256_blend{suf}(R, RC, 64 + 128);
       {styp} GA = _mm256_permutevar8x32{suf}(v0, GAm);
       {styp} GB = _mm256_permutevar8x32{suf}(v1, GBm);
       {styp} GC = _mm256_permutevar8x32{suf}(v2, GCm);
       {styp} G = _mm256_blend{suf}(GA, GB, 8 + 16);
       ret.v1 = _mm256_blend{suf}(G, GC, 32 + 64 + 128);
       {styp} BA = _mm256_permutevar8x32{suf}(v0, BAm);
       {styp} BB = _mm256_permutevar8x32{suf}(v1, BBm);
       {styp} BC = _mm256_permutevar8x32{suf}(v2, BCm);
       {styp} B = _mm256_blend{suf}(BA, BB, 4 + 8 + 16);
       ret.v2 = _mm256_blend{suf}(B, BC, 32 + 64 + 128);
       return ret;'''.format(**fmtspec)
    if typ == 'f32':
        if simd_ext == 'avx2':
            return avx2_template
        else:
            # Plain AVX: _mm256_permutevar_ps only permutes within each
            # 128-bit lane, so the elements that must cross lanes are patched
            # with extra _mm256_permute2f128_ps + blend steps.
            return \
            '''nsimd_avx_v{typ}x3 ret;
               {load_v0v1v2}
               __m256i RAm = _mm256_setr_epi32( 0, 3, -1, -1, -1, -1, 2, -1);
               __m256i RBm = _mm256_setr_epi32(-1, -1, -1, 1, 0, 3, -1, -1);
               __m256i RCm = _mm256_setr_epi32( 0, 0, 2, 0, 1, 1, 1, 1);
               __m256i GAm = _mm256_setr_epi32( 1, -1, -1, -1, -1, 0, 3, -1);
               __m256i GBm = _mm256_setr_epi32(-1, -1, -1, 2, 5, -1, -1, -1);
               __m256i GCm = _mm256_setr_epi32(-1, 0, 3, -1, -1, -1, -1, 6);
               __m256i BAm = _mm256_setr_epi32( 2, -1, -1, -1, -1, 1, -1, -1);
               __m256i BBm = _mm256_setr_epi32(-1, -1, 0, 3, 6, -1, -1, -1);
               __m256i BCm = _mm256_setr_epi32(-1, 1, -1, -1, -1, -1, 4, 7);
               __m256 RA = _mm256_permutevar_ps(v0, RAm);
               __m256 RAi = _mm256_permute2f128_ps(RA, RA, (2 << 4) | 1);
               RA = _mm256_blend_ps(RAi, RA, 1 + 2);
               __m256 RB = _mm256_permutevar_ps(v1, RBm);
               __m256 RC = _mm256_permutevar_ps(v2, RCm);
               __m256 RCi = _mm256_permute2f128_ps(RC, RC, 2 << 4);
               RC = _mm256_blend_ps(RC, RCi, 64);
               __m256 R = _mm256_blend_ps(RA, RB, 8 + 16 + 32);
               ret.v0 = _mm256_blend_ps(R, RC, 64 + 128);
               __m256 GA = _mm256_permutevar_ps(v0, GAm);
               __m256 GAi = _mm256_permute2f128_ps(GA, GA, (2 << 4) | 1);
               GA = _mm256_blend_ps(GA, GAi, 2 + 4);
               __m256 GB = _mm256_permutevar_ps(v1, GBm);
               __m256 GC = _mm256_permutevar_ps(v2, GCm);
               __m256 GCi = _mm256_permute2f128_ps(GC, GC, 2 << 4);
               GC = _mm256_blend_ps(GC, GCi, 32 + 64);
               __m256 G = _mm256_blend_ps(GA, GB, 8 + 16);
               ret.v1 = _mm256_blend_ps(G, GC, 32 + 64 + 128);
               __m256 BA = _mm256_permutevar_ps(v0, BAm);
               __m256 BAi = _mm256_permute2f128_ps(BA, BA, (2 << 4) | 1);
               BA = _mm256_blend_ps(BA, BAi, 2);
               __m256 BB = _mm256_permutevar_ps(v1, BBm);
               __m256 BC = _mm256_permutevar_ps(v2, BCm);
               __m256 BCi = _mm256_permute2f128_ps(BC, BC, 2 << 4);
               BC = _mm256_blend_ps(BC, BCi, 32);
               __m256 B = _mm256_blend_ps(BA, BB, 4 + 8 + 16);
               ret.v2 = _mm256_blend_ps(B, BC, 32 + 64 + 128);
               return ret;'''.format(**fmtspec)
    # NOTE(review): 'f32' in the list below is unreachable -- the dedicated
    # "if typ == 'f32'" branch above always returns first.  Only i32/u32 can
    # get here.
    if typ in ['i32', 'u32', 'f32']:
        if simd_ext == 'avx2':
            return avx2_template
        else:
            # i32/u32 on plain AVX: reuse the f32 implementation through
            # bitcasts (same bit pattern, AVX1 only has float shuffles).
            return \
            '''nsimd_avx_v{typ}x3 ret;
               nsimd_avx_vf32x3 retf32 = nsimd_load3{a}_avx_f32((f32 *){in0});
               ret.v0 = _mm256_castps_si256(retf32.v0);
               ret.v1 = _mm256_castps_si256(retf32.v1);
               ret.v2 =
               _mm256_castps_si256(retf32.v2);
               return ret;'''.format(**fmtspec)
    # 64-bit elements on AVX2: qword permutes (_mm256_permute4x64) plus
    # 128-bit lane permutes; serves f64 and i64/u64 via {styp}/{suf}/{sufsi}.
    avx2_template = \
    '''nsimd_avx2_v{typ}x3 ret;
       {load_v0v1v2}
       {styp} A1 = _mm256_permute4x64{suf}(v0, _MM_SHUFFLE(2, 1, 3, 0));
       {styp} C2 = _mm256_permute4x64{suf}(v2, _MM_SHUFFLE(3, 0, 2, 1));
       {styp} B3 = _mm256_permute2f128{sufsi}(A1, v1, (2 << 4) | 1);
       {styp} B4 = _mm256_permute2f128{sufsi}(v1, C2, (2 << 4) | 1);
       {styp} B5 = _mm256_permute4x64{suf}(B3, _MM_SHUFFLE(3, 1, 2, 0));
       {styp} B6 = _mm256_permute4x64{suf}(B4, _MM_SHUFFLE(3, 1, 2, 0));
       ret.v0 = _mm256_permute2f128{sufsi}(A1, B6, 2 << 4);
       ret.v1 = _mm256_permute2f128{sufsi}(B5, B6, 3 << 4);
       ret.v2 = _mm256_permute2f128{sufsi}(B5, C2, (3 << 4 ) | 1);
       return ret;'''.format(**fmtspec)
    if typ == 'f64':
        if simd_ext == 'avx2':
            return avx2_template
        else:
            # Plain AVX f64: only 128-bit permutes and in-lane pd permutes
            # are available.
            return \
            '''nsimd_avx_v{typ}x3 ret;
               {load_v0v1v2}
               __m256d R1 = _mm256_permute2f128_pd(v0, v2, (2 << 4) | 1);
               __m256d R2 = _mm256_permute2f128_pd(v0, v1, 3 << 4);
               ret.v0 = _mm256_blend_pd(R1, R2, 1 + 4);
               __m256d G1 = _mm256_permute2f128_pd(v0, v1, 3 << 4);
               __m256d G2 = _mm256_permute2f128_pd(v1, v2, 3 << 4);
               __m256d G = _mm256_blend_pd(G1, G2, 1 + 4);
               ret.v1 = _mm256_permute_pd(G, 1 + 4);
               __m256d B1 = _mm256_permute2f128_pd(v0, v2, (2 << 4) | 1);
               __m256d B2 = _mm256_permute2f128_pd(v1, v2, 3 << 4);
               ret.v2 = _mm256_blend_pd(B1, B2, 2 + 8);
               return ret;'''.format(**fmtspec)
    if typ in ['i64', 'u64']:
        if simd_ext == 'avx2':
            return avx2_template
        else:
            # i64/u64 on plain AVX: reuse the f64 implementation through
            # bitcasts.
            return \
            '''nsimd_avx_v{typ}x3 ret;
               nsimd_avx_vf64x3 retf64 = nsimd_load3{a}_avx_f64((f64 *){in0});
               ret.v0 = _mm256_castpd_si256(retf64.v0);
               ret.v1 = _mm256_castpd_si256(retf64.v1);
               ret.v2 = _mm256_castpd_si256(retf64.v2);
               return ret;'''.format(**fmtspec)

###############################################################################

def store3_avx(simd_ext, typ, align, fmtspec2):
    # Emit the C body of the AVX/AVX2 3-way interleaving store
    # (nsimd_store3{a}): the inverse of load3_avx.  The three component
    # registers {in1}/{in2}/{in3} (x's, y's, z's) are re-interleaved into
    # three consecutive registers A, B, C and written to {in0} by the
    # {store}/store3 helper.
    fmtspec = fmtspec2.copy()
    # 128-bit half extraction snippets for the plain-AVX 8/16-bit fallbacks.
    fmtspec['exlo_in1'] = x86.extract('avx', typ, x86.LO, common.in1)
    fmtspec['exhi_in1'] = x86.extract('avx', typ, x86.HI, common.in1)
    fmtspec['exlo_in2'] = x86.extract('avx', typ, x86.LO, common.in2)
    fmtspec['exhi_in2'] = x86.extract('avx', typ, x86.HI, common.in2)
    fmtspec['exlo_in3'] = x86.extract('avx', typ, x86.LO, common.in3)
    fmtspec['exhi_in3'] = x86.extract('avx', typ, x86.HI, common.in3)
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        if simd_ext == 'avx2':
            # AVX2: scatter each component into its interleaved positions
            # with in-lane byte shuffles (-1 zeroes), OR the partials, then
            # reassemble A and C from the mixed AC/CA registers with
            # _mm256_permute2f128_si256.
            return \
            '''__m256i RACm = _mm256_setr_epi8( 0, -1, -1, 1, -1, -1, 2, -1,
                               -1, 3, -1, -1, 4, -1, -1, 5, -1, 27, -1, -1,
                               28, -1, -1, 29, -1, -1, 30, -1, -1, 31, -1,
                               -1);
               __m256i RBBm = _mm256_setr_epi8(-1, 11, -1, -1, 12, -1, -1, 13,
                               -1, -1, 14, -1, -1, 15, -1, -1, 16, -1, -1, 17,
                               -1, -1, 18, -1, -1, 19, -1, -1, 20, -1, -1,
                               21);
               __m256i RCAm = _mm256_setr_epi8(-1, -1, 22, -1, -1, 23, -1, -1,
                               24, -1, -1, 25, -1, -1, 26, -1, -1, -1, 6, -1,
                               -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1);
               __m256i GACm = _mm256_setr_epi8(-1, 0, -1, -1, 1, -1, -1, 2,
                               -1, -1, 3, -1, -1, 4, -1, -1, -1, -1, 27, -1,
                               -1, 28, -1, -1, 29, -1, -1, 30, -1, -1, 31,
                               -1);
               __m256i GBBm = _mm256_setr_epi8(-1, -1, 11, -1, -1, 12, -1, -1,
                               13, -1, -1, 14, -1, -1, 15, -1, -1, 16, -1, -1,
                               17, -1, -1, 18, -1, -1, 19, -1, -1, 20, -1,
                               -1);
               __m256i GCAm = _mm256_setr_epi8(21, -1, -1, 22, -1, -1, 23, -1,
                               -1, 24, -1, -1, 25, -1, -1, 26, 5, -1, -1, 6,
                               -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10);
               __m256i BACm = _mm256_setr_epi8(-1, -1, 0, -1, -1, 1, -1, -1,
                               2, -1, -1, 3, -1, -1, 4, -1, 26, -1, -1, 27,
                               -1, -1, 28, -1, -1, 29, -1, -1, 30, -1, -1,
                               31);
               __m256i BBBm = _mm256_setr_epi8(10, -1, -1, 11, -1, -1, 12, -1,
                               -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 16, -1,
                               -1, 17, -1, -1, 18, -1, -1, 19, -1, -1, 20,
                               -1);
               __m256i BCAm = _mm256_setr_epi8(-1, 21, -1, -1, 22, -1, -1, 23,
                               -1, -1, 24, -1, -1, 25, -1, -1, -1, 5, -1, -1,
                               6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1);
               __m256i RAC = _mm256_shuffle_epi8({in1}, RACm);
               __m256i GAC = _mm256_shuffle_epi8({in2}, GACm);
               __m256i BAC = _mm256_shuffle_epi8({in3}, BACm);
               __m256i RBB = _mm256_shuffle_epi8({in1}, RBBm);
               __m256i GBB = _mm256_shuffle_epi8({in2}, GBBm);
               __m256i BBB = _mm256_shuffle_epi8({in3}, BBBm);
               __m256i RCA = _mm256_shuffle_epi8({in1}, RCAm);
               __m256i GCA = _mm256_shuffle_epi8({in2}, GCAm);
               __m256i BCA = _mm256_shuffle_epi8({in3}, BCAm);
               __m256i AC = _mm256_or_si256(RAC, GAC);
               AC = _mm256_or_si256(AC, BAC);
               __m256i B = _mm256_or_si256(RBB, GBB);
               B = _mm256_or_si256(B, BBB);
               __m256i CA = _mm256_or_si256(RCA, GCA);
               CA = _mm256_or_si256(CA, BCA);
               __m256i A = _mm256_permute2f128_si256(AC, CA, 2 << 4);
               __m256i C = _mm256_permute2f128_si256(AC, CA, (1 << 4) | 3);
               {store}'''.format(store=store3('avx', typ, align, fmtspec,
                                              'A', 'B', 'C'), **fmtspec)
        else:
            # Plain AVX fallback: interleave each 128-bit half with SSSE3
            # shuffles, then merge the six 128-bit outputs into A, B, C.
            return \
            '''__m128i Ra = {exlo_in1};
               __m128i Rb = {exhi_in1};
               __m128i Ga = {exlo_in2};
               __m128i Gb = {exhi_in2};
               __m128i Ba = {exlo_in3};
               __m128i Bb = {exhi_in3};
               __m128i RAm = _mm_set_epi8( 5, -1, -1, 4, -1, -1, 3, -1, -1, 2,
                                          -1, -1, 1, -1, -1, 0);
               __m128i GAm = _mm_set_epi8(-1, -1, 4, -1, -1, 3, -1, -1, 2, -1,
                                          -1, 1, -1, -1, 0, -1);
               __m128i BAm = _mm_set_epi8(-1, 4, -1, -1, 3, -1, -1, 2, -1, -1,
                                          1, -1, -1, 0, -1, -1);
               __m128i RA = _mm_shuffle_epi8(Ra, RAm);
               __m128i GA = _mm_shuffle_epi8(Ga, GAm);
               __m128i BA = _mm_shuffle_epi8(Ba, BAm);
               __m128i A0 = _mm_or_si128(RA, GA);
               A0 = _mm_or_si128(A0, BA);
               RA = _mm_shuffle_epi8(Rb, RAm);
               GA = _mm_shuffle_epi8(Gb, GAm);
               BA = _mm_shuffle_epi8(Bb, BAm);
               __m128i A1 = _mm_or_si128(RA, GA);
               A1 = _mm_or_si128(A1, BA);
               __m128i RBm = _mm_set_epi8(-1, 10, -1, -1, 9, -1, -1, 8, -1,
                                          -1, 7, -1, -1, 6, -1, -1);
               __m128i GBm = _mm_set_epi8(10, -1, -1, 9, -1, -1, 8, -1, -1, 7,
                                          -1, -1, 6, -1, -1, 5);
               __m128i BBm = _mm_set_epi8(-1, -1, 9, -1, -1, 8, -1, -1, 7, -1,
                                          -1, 6, -1, -1, 5, -1);
               __m128i RB = _mm_shuffle_epi8(Ra, RBm);
               __m128i GB = _mm_shuffle_epi8(Ga, GBm);
               __m128i BB = _mm_shuffle_epi8(Ba, BBm);
               __m128i B0 = _mm_or_si128(RB, GB);
               B0 = _mm_or_si128(B0, BB);
               RB = _mm_shuffle_epi8(Rb, RBm);
               GB = _mm_shuffle_epi8(Gb, GBm);
               BB = _mm_shuffle_epi8(Bb, BBm);
               __m128i B1 = _mm_or_si128(RB, GB);
               B1 = _mm_or_si128(B1, BB);
               __m128i RCm = _mm_set_epi8(-1, -1, 15, -1, -1, 14, -1, -1, 13,
                                          -1, -1, 12, -1, -1, 11, -1);
               __m128i GCm = _mm_set_epi8(-1, 15, -1, -1, 14, -1, -1, 13, -1,
                                          -1, 12, -1, -1, 11, -1, -1);
               __m128i BCm = _mm_set_epi8(15, -1, -1, 14, -1, -1, 13, -1, -1,
                                          12, -1, -1, 11, -1, -1, 10);
               __m128i RC = _mm_shuffle_epi8(Ra, RCm);
               __m128i GC = _mm_shuffle_epi8(Ga, GCm);
               __m128i BC = _mm_shuffle_epi8(Ba, BCm);
               __m128i C0 = _mm_or_si128(RC, GC);
               C0 = _mm_or_si128(C0, BC);
               RC = _mm_shuffle_epi8(Rb, RCm);
               GC = _mm_shuffle_epi8(Gb, GCm);
               BC = _mm_shuffle_epi8(Bb, BCm);
               __m128i C1 = _mm_or_si128(RC, GC);
               C1 = _mm_or_si128(C1, BC);
               __m256i A = {mergeA0B0};
               __m256i B = {mergeC0A1};
               __m256i C = {mergeB1C1};
               {store}'''.format(mergeA0B0=x86.setr('avx', typ, 'A0', 'B0'),
                                 mergeC0A1=x86.setr('avx', typ, 'C0', 'A1'),
                                 mergeB1C1=x86.setr('avx', typ, 'B1', 'C1'),
                                 store=store3('avx', typ, align, fmtspec,
                                              'A', 'B', 'C'), **fmtspec)
    if typ in ['i16', 'u16']:
        if simd_ext == 'avx2':
            # Same scheme as the 8-bit AVX2 store, with byte-pair (16-bit)
            # mask indices.
            return \
            '''__m256i RACm = _mm256_setr_epi8( 0, 1, -1, -1, -1, -1, 2, 3,
                               -1, -1, -1, -1, 4, 5, -1, -1, -1, -1, -1, -1,
                               12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1,
                               -1);
               __m256i RBBm = _mm256_setr_epi8(-1, -1, -1, -1, 12, 13, -1, -1,
                               -1, -1, 14, 15, -1, -1, -1, -1, 0, 1, -1, -1,
                               -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1);
               __m256i RCAm = _mm256_setr_epi8(-1, -1, 6, 7, -1, -1, -1, -1,
                               8, 9, -1, -1, -1, -1, 10, 11, -1, -1, 6, 7, -1,
                               -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11);
               __m256i GACm = _mm256_setr_epi8(-1, -1, 0, 1, -1, -1, -1, -1,
                               2, 3, -1, -1, -1, -1, 4, 5, 10, 11, -1, -1, -1,
                               -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1);
               __m256i GBBm = _mm256_setr_epi8(10, 11, -1, -1, -1, -1, 12, 13,
                               -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, 0, 1,
                               -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5);
               __m256i GCAm = _mm256_setr_epi8(-1, -1, -1, -1, 6, 7, -1, -1,
                               -1, -1, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1,
                               6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1);
               __m256i BACm = _mm256_setr_epi8(-1, -1, -1, -1, 0, 1, -1, -1,
                               -1, -1, 2, 3, -1, -1, -1, -1, -1, -1, 10, 11,
                               -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14,
                               15);
               __m256i BBBm = _mm256_setr_epi8(-1, -1, 10, 11, -1, -1, -1, -1,
                               12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1,
                               0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1);
               __m256i BCAm = _mm256_setr_epi8( 4, 5, -1, -1, -1, -1, 6, 7,
                               -1, -1, -1, -1, 8, 9, -1, -1, 4, 5, -1, -1, -1,
                               -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1);
               __m256i RAC = _mm256_shuffle_epi8({in1}, RACm);
               __m256i GAC = _mm256_shuffle_epi8({in2}, GACm);
               __m256i BAC = _mm256_shuffle_epi8({in3}, BACm);
               __m256i RBB = _mm256_shuffle_epi8({in1}, RBBm);
               __m256i GBB = _mm256_shuffle_epi8({in2}, GBBm);
               __m256i BBB = _mm256_shuffle_epi8({in3}, BBBm);
               __m256i RCA = _mm256_shuffle_epi8({in1}, RCAm);
               __m256i GCA = _mm256_shuffle_epi8({in2}, GCAm);
               __m256i BCA = _mm256_shuffle_epi8({in3}, BCAm);
               __m256i AC = _mm256_or_si256(RAC, GAC);
               AC = _mm256_or_si256(AC, BAC);
               __m256i B = _mm256_or_si256(RBB, GBB);
               B = _mm256_or_si256(B, BBB);
               __m256i CA = _mm256_or_si256(RCA, GCA);
               CA = _mm256_or_si256(CA, BCA);
               __m256i A = _mm256_permute2f128_si256(AC, CA, 2 << 4);
               __m256i C = _mm256_permute2f128_si256(AC, CA, (1 << 4) | 3);
               {store}'''.format(store=store3('avx', typ, align, fmtspec,
                                              'A', 'B', 'C'), **fmtspec)
        else:
            # Plain AVX fallback, 128-bit halves + SSSE3 shuffles.
            return \
            '''__m128i Ra = {exlo_in1};
               __m128i Rb = {exhi_in1};
               __m128i Ga = {exlo_in2};
               __m128i Gb = {exhi_in2};
               __m128i Ba = {exlo_in3};
               __m128i Bb = {exhi_in3};
               __m128i RAm = _mm_set_epi8(-1, -1, 5, 4, -1, -1, -1, -1, 3, 2,
                                          -1, -1, -1, -1, 1, 0);
               __m128i GAm = _mm_set_epi8( 5, 4, -1, -1, -1, -1, 3, 2, -1, -1,
                                          -1, -1, 1, 0, -1, -1);
               __m128i BAm = _mm_set_epi8(-1, -1, -1, -1, 3, 2, -1, -1, -1,
                                          -1, 1, 0, -1, -1, -1, -1);
               __m128i RA = _mm_shuffle_epi8(Ra, RAm);
               __m128i GA = _mm_shuffle_epi8(Ga, GAm);
               __m128i BA = _mm_shuffle_epi8(Ba, BAm);
               __m128i A0 = _mm_or_si128(RA, GA);
               A0 = _mm_or_si128(A0, BA);
               RA = _mm_shuffle_epi8(Rb, RAm);
               GA = _mm_shuffle_epi8(Gb, GAm);
               BA = _mm_shuffle_epi8(Bb, BAm);
               __m128i A1 = _mm_or_si128(RA, GA);
               A1 = _mm_or_si128(A1, BA);
               __m128i RBm = _mm_set_epi8(11, 10, -1, -1, -1, -1, 9, 8, -1,
                                          -1, -1, -1, 7, 6, -1, -1);
               __m128i GBm = _mm_set_epi8(-1, -1, -1, -1, 9, 8, -1, -1, -1,
                                          -1, 7, 6, -1, -1, -1, -1);
               __m128i BBm = _mm_set_epi8(-1, -1, 9, 8, -1, -1, -1, -1, 7, 6,
                                          -1, -1, -1, -1, 5, 4);
               __m128i RB = _mm_shuffle_epi8(Ra, RBm);
               __m128i GB = _mm_shuffle_epi8(Ga, GBm);
               __m128i BB = _mm_shuffle_epi8(Ba, BBm);
               __m128i B0 = _mm_or_si128(RB, GB);
               B0 = _mm_or_si128(B0, BB);
               RB = _mm_shuffle_epi8(Rb, RBm);
               GB = _mm_shuffle_epi8(Gb, GBm);
               BB = _mm_shuffle_epi8(Bb, BBm);
               __m128i B1 = _mm_or_si128(RB, GB);
               B1 = _mm_or_si128(B1, BB);
               __m128i RCm = _mm_set_epi8(-1, -1, -1, -1, 15, 14, -1, -1, -1,
                                          -1, 13, 12, -1, -1, -1, -1);
               __m128i GCm = _mm_set_epi8(-1, -1, 15, 14, -1, -1, -1, -1, 13,
                                          12, -1, -1, -1, -1, 11, 10);
               __m128i BCm = _mm_set_epi8(15, 14, -1, -1, -1, -1, 13, 12, -1,
                                          -1, -1, -1, 11, 10, -1, -1);
               __m128i RC = _mm_shuffle_epi8(Ra, RCm);
               __m128i GC = _mm_shuffle_epi8(Ga, GCm);
               __m128i BC = _mm_shuffle_epi8(Ba, BCm);
               __m128i C0 = _mm_or_si128(RC, GC);
               C0 = _mm_or_si128(C0, BC);
               RC = _mm_shuffle_epi8(Rb, RCm);
               GC = _mm_shuffle_epi8(Gb, GCm);
               BC = _mm_shuffle_epi8(Bb, BCm);
               __m128i C1 = _mm_or_si128(RC, GC);
               C1 = _mm_or_si128(C1, BC);
               __m256i A = {mergeA0B0};
               __m256i B = {mergeC0A1};
               __m256i C = {mergeB1C1};
               {store}'''.format(mergeA0B0=x86.setr('avx', typ, 'A0', 'B0'),
                                 mergeC0A1=x86.setr('avx', typ, 'C0', 'A1'),
                                 mergeB1C1=x86.setr('avx', typ, 'B1', 'C1'),
                                 store=store3('avx', typ, align, fmtspec,
                                              'A', 'B', 'C'), **fmtspec)
    # 32-bit elements on AVX2: cross-lane dword permutes + blends; serves
    # f32 and i32/u32 via {styp}/{suf}.
    avx2_template = \
    '''__m256i RAm = _mm256_setr_epi32( 0, -1, -1, 1, -1, -1, 2, -1);
       __m256i RBm = _mm256_setr_epi32(-1, 3, -1, -1, 4, -1, -1, 5);
       __m256i RCm = _mm256_setr_epi32(-1, -1, 6, -1, -1, 7, -1, -1);
       __m256i GAm = _mm256_setr_epi32(-1, 0, -1, -1, 1, -1, -1, 2);
       __m256i GBm = _mm256_setr_epi32(-1, -1, 3, -1, -1, 4, -1, -1);
       __m256i GCm = _mm256_setr_epi32( 5, -1, -1, 6, -1, -1, 7, -1);
       __m256i BAm = _mm256_setr_epi32(-1, -1, 0, -1, -1, 1, -1, -1);
       __m256i BBm = _mm256_setr_epi32( 2, -1, -1, 3, -1, -1, 4, -1);
       __m256i BCm = _mm256_setr_epi32(-1, 5, -1, -1, 6, -1, -1, 7);
       {styp} RA = _mm256_permutevar8x32{suf}({in1}, RAm);
       {styp} RB = _mm256_permutevar8x32{suf}({in1}, RBm);
       {styp} RC = _mm256_permutevar8x32{suf}({in1}, RCm);
       {styp} GA = _mm256_permutevar8x32{suf}({in2}, GAm);
       {styp} GB = _mm256_permutevar8x32{suf}({in2}, GBm);
       {styp} GC = _mm256_permutevar8x32{suf}({in2}, GCm);
       {styp} BA = _mm256_permutevar8x32{suf}({in3}, BAm);
       {styp} BB = _mm256_permutevar8x32{suf}({in3}, BBm);
       {styp} BC = _mm256_permutevar8x32{suf}({in3}, BCm);
       {styp} A = _mm256_blend{suf}(RA, GA, 2 + 16 + 128);
       A = _mm256_blend{suf}(A, BA, 4 + 32);
       {styp} B = _mm256_blend{suf}(RB, GB, 4 + 32);
       B = _mm256_blend{suf}(B, BB, 1 + 8 + 64);
       {styp} C = _mm256_blend{suf}(RC, GC, 1 + 8 + 64);
       C = _mm256_blend{suf}(C, BC, 2 + 16 + 128);
       {store}'''.format(store=store3('avx', typ, align, fmtspec,
                                      'A', 'B', 'C'), **fmtspec)
    if typ == 'f32':
        if simd_ext == 'avx2':
            return avx2_template
        else:
            # Plain AVX: in-lane permutevar_ps + blends build A1/B/C1; a
            # final pair of _mm256_permute2f128_ps fixes the lane crossings.
            # NOTE(review): the avx2_template=avx2_template kwarg below is
            # unused by this template (no {avx2_template} placeholder) --
            # harmless but dead.
            return \
            '''__m256i RAm = _mm256_setr_epi32( 0, -1, -1, 1, -1, -1, 2, -1);
               __m256i RBm = _mm256_setr_epi32(-1, 3, -1, -1, 4, -1, -1, 5);
               __m256i RCm = _mm256_setr_epi32(-1, -1, 6, -1, -1, 7, -1, -1);
               __m256i GAm = _mm256_setr_epi32(-1, 0, -1, -1, 1, -1, -1, 2);
               __m256i GBm = _mm256_setr_epi32(-1, -1, 3, -1, -1, 4, -1, -1);
               __m256i GCm = _mm256_setr_epi32( 5, -1, -1, 6, -1, -1, 7, -1);
               __m256i BAm = _mm256_setr_epi32(-1, -1, 0, -1, -1, 1, -1, -1);
               __m256i BBm = _mm256_setr_epi32( 2, -1, -1, 3, -1, -1, 4, -1);
               __m256i BCm = _mm256_setr_epi32(-1, 5, -1, -1, 6, -1, -1, 7);
               __m256 RA = _mm256_permutevar_ps({in1}, RAm);
               __m256 RB = _mm256_permutevar_ps({in1}, RBm);
               __m256 RC = _mm256_permutevar_ps({in1}, RCm);
               __m256 GA = _mm256_permutevar_ps({in2}, GAm);
               __m256 GB = _mm256_permutevar_ps({in2}, GBm);
               __m256 GC = _mm256_permutevar_ps({in2}, GCm);
               __m256 BA = _mm256_permutevar_ps({in3}, BAm);
               __m256 BB = _mm256_permutevar_ps({in3}, BBm);
               __m256 BC = _mm256_permutevar_ps({in3}, BCm);
               __m256 A1 = _mm256_blend_ps(RA, GA, 2 + 16 + 128);
               A1 = _mm256_blend_ps(A1, BA, 4 + 32);
               __m256 B = _mm256_blend_ps(RB, GB, 4 + 32);
               B = _mm256_blend_ps(B, BB, 1 + 8 + 64);
               __m256 C1 = _mm256_blend_ps(RC, GC, 1 + 8 + 64);
               C1 = _mm256_blend_ps(C1, BC, 2 + 16 + 128);
               __m256 A = _mm256_permute2f128_ps(A1, C1, 2 << 4);
               __m256 C = _mm256_permute2f128_ps(A1, C1, (3 << 4) | 1);
               {store}'''.format(avx2_template=avx2_template,
                                 store=store3('avx', typ, align, fmtspec,
                                              'A', 'B', 'C'), **fmtspec)
    if typ in ['i32', 'u32']:
        if simd_ext == 'avx2':
            return avx2_template
        else:
            # i32/u32 on plain AVX: reuse the f32 store through bitcasts.
            return \
            '''nsimd_store3{a}_avx_f32((f32 *){in0},
                             _mm256_castsi256_ps({in1}),
                             _mm256_castsi256_ps({in2}),
                             _mm256_castsi256_ps({in3}));'''. \
            format(**fmtspec)
    if typ == 'f64':
        # f64: same code for AVX and AVX2 (only pd blends/permutes needed).
        return \
        '''__m256d invv1 = _mm256_permute_pd({in2}, 1 + 4);
           __m256d A1C0 = _mm256_blend_pd({in1}, {in3}, 1 + 4);
           __m256d A0B1 = _mm256_blend_pd({in1}, invv1, 2 + 8);
           __m256d B0C1 = _mm256_blend_pd(invv1, {in3}, 2 + 8);
           __m256d A = _mm256_permute2f128_pd(A0B1, A1C0, 2 << 4);
           __m256d B = _mm256_blend_pd(B0C1, A0B1, 4 + 8);
           __m256d C = _mm256_permute2f128_pd(A1C0, B0C1, (3 << 4) | 1);
           {store}'''.format(store=store3('avx', typ, align, fmtspec,
                                          'A', 'B', 'C'), **fmtspec)
    if typ in ['i64', 'u64']:
        # i64/u64: reuse the f64 store through bitcasts.
        return \
        '''nsimd_store3{a}_{simd_ext}_f64((f64 *){in0},
                         _mm256_castsi256_pd({in1}),
                         _mm256_castsi256_pd({in2}),
                         _mm256_castsi256_pd({in3}));'''.
\ format(**fmtspec) ############################################################################### def load3_avx512(simd_ext, typ, align, fmtspec2): fmtspec = fmtspec2.copy() fmtspec['load_v0v1v2'] = get_load_v0v1v2(simd_ext, typ, align, fmtspec) fmtspec['exlo_v0'] = x86.extract(simd_ext, typ, x86.LO, 'v0') fmtspec['exhi_v0'] = x86.extract(simd_ext, typ, x86.HI, 'v0') fmtspec['exlo_v1'] = x86.extract(simd_ext, typ, x86.LO, 'v1') fmtspec['exhi_v1'] = x86.extract(simd_ext, typ, x86.HI, 'v1') fmtspec['exlo_v2'] = x86.extract(simd_ext, typ, x86.LO, 'v2') fmtspec['exhi_v2'] = x86.extract(simd_ext, typ, x86.HI, 'v2') fmtspec['a'] = 'a' if align else 'u' if typ in ['i8', 'u8']: return \ '''nsimd_{simd_ext}_v{typ}x3 ret; {load_v0v1v2} __m256i A0in = {exlo_v0}; __m256i B0in = {exhi_v0}; __m256i C0in = {exlo_v1}; __m256i A1in = {exhi_v1}; __m256i B1in = {exlo_v2}; __m256i C1in = {exhi_v2}; __m256i ARmask = _mm256_setr_epi8( 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1); __m256i BRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m256i CRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13); __m256i AR = _mm256_shuffle_epi8(A0in, ARmask); __m256i BR = _mm256_shuffle_epi8(B0in, BRmask); __m256i CR = _mm256_shuffle_epi8(C0in, CRmask); __m256i DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1); __m256i R0 = _mm256_or_si256(AR, BR); __m256i R1 = _mm256_or_si256(BR, CR); __m256i R2 = _mm256_permute2f128_si256(R0, R1, 3 << 4); __m256i R3 = _mm256_or_si256(DR, R2); AR = _mm256_shuffle_epi8(A1in, ARmask); BR = _mm256_shuffle_epi8(B1in, BRmask); CR = _mm256_shuffle_epi8(C1in, CRmask); DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1); R0 = _mm256_or_si256(AR, BR); R1 = _mm256_or_si256(BR, CR); R2 = 
_mm256_permute2f128_si256(R0, R1, 3 << 4); __m256i R3b = _mm256_or_si256(DR, R2); __m256i AGmask = _mm256_setr_epi8( 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1); __m256i BGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m256i CGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14); __m256i AG = _mm256_shuffle_epi8(A0in, AGmask); __m256i BG = _mm256_shuffle_epi8(B0in, BGmask); __m256i CG = _mm256_shuffle_epi8(C0in, CGmask); __m256i DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1); __m256i G0 = _mm256_or_si256(AG, BG); __m256i G1 = _mm256_or_si256(BG, CG); __m256i G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4); __m256i G3 = _mm256_or_si256(DG, G2); AG = _mm256_shuffle_epi8(A1in, AGmask); BG = _mm256_shuffle_epi8(B1in, BGmask); CG = _mm256_shuffle_epi8(C1in, CGmask); DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1); G0 = _mm256_or_si256(AG, BG); G1 = _mm256_or_si256(BG, CG); G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4); __m256i G3b = _mm256_or_si256(DG, G2); __m256i ABmask = _mm256_setr_epi8( 2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1); __m256i BBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m256i CBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15); __m256i AB = _mm256_shuffle_epi8(A0in, ABmask); __m256i BB = _mm256_shuffle_epi8(B0in, BBmask); __m256i CB = _mm256_shuffle_epi8(C0in, CBmask); __m256i DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1); __m256i B0 = _mm256_or_si256(AB, BB); __m256i B1 = _mm256_or_si256(BB, 
CB); __m256i B2 = _mm256_permute2f128_si256(B0, B1, 3 << 4); __m256i B3 = _mm256_or_si256(DB, B2); AB = _mm256_shuffle_epi8(A1in, ABmask); BB = _mm256_shuffle_epi8(B1in, BBmask); CB = _mm256_shuffle_epi8(C1in, CBmask); DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1); B0 = _mm256_or_si256(AB, BB); B1 = _mm256_or_si256(BB, CB); B2 = _mm256_permute2f128_si256(B0, B1, 3 << 4); __m256i B3b = _mm256_or_si256(DB, B2); ret.v0 = {mergeR}; ret.v1 = {mergeG}; ret.v2 = {mergeB}; return ret;'''. \ format(mergeR=x86.setr(simd_ext, typ, 'R3', 'R3b'), mergeG=x86.setr(simd_ext, typ, 'G3', 'G3b'), mergeB=x86.setr(simd_ext, typ, 'B3', 'B3b'), **fmtspec) if typ in ['i16', 'u16']: return \ '''nsimd_{simd_ext}_v{typ}x3 ret; {load_v0v1v2} __m256i A0a = {exlo_v0}; __m256i B0a = {exhi_v0}; __m256i C0a = {exlo_v1}; __m256i A0b = {exhi_v1}; __m256i B0b = {exlo_v2}; __m256i C0b = {exhi_v2}; __m256i ARmask = _mm256_setr_epi8( 0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 3, 8, 9, 14, 15, -1, -1, -1, -1); __m256i BRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m256i CRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 10, 11); __m256i AR = _mm256_shuffle_epi8(A0a, ARmask); __m256i BR = _mm256_shuffle_epi8(B0a, BRmask); __m256i CR = _mm256_shuffle_epi8(C0a, CRmask); __m256i DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1); __m256i R0 = _mm256_or_si256(AR, BR); __m256i R1 = _mm256_or_si256(BR, CR); __m256i R2 = _mm256_permute2f128_si256(R0, R1, 3 << 4); __m256i R3a = _mm256_or_si256(DR, R2); AR = _mm256_shuffle_epi8(A0b, ARmask); BR = _mm256_shuffle_epi8(B0b, BRmask); CR = _mm256_shuffle_epi8(C0b, CRmask); DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1); R0 = _mm256_or_si256(AR, BR); R1 = _mm256_or_si256(BR, CR); R2 = _mm256_permute2f128_si256(R0, R1, 3 << 
4); __m256i R3b = _mm256_or_si256(DR, R2); __m256i AGmask = _mm256_setr_epi8( 2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1); __m256i BGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m256i CGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 6, 7, 12, 13); __m256i AG = _mm256_shuffle_epi8(A0a, AGmask); __m256i BG = _mm256_shuffle_epi8(B0a, BGmask); __m256i CG = _mm256_shuffle_epi8(C0a, CGmask); __m256i DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1); __m256i G0 = _mm256_or_si256(AG, BG); __m256i G1 = _mm256_or_si256(BG, CG); __m256i G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4); __m256i G3a = _mm256_or_si256(DG, G2); AG = _mm256_shuffle_epi8(A0b, AGmask); BG = _mm256_shuffle_epi8(B0b, BGmask); CG = _mm256_shuffle_epi8(C0b, CGmask); DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1); G0 = _mm256_or_si256(AG, BG); G1 = _mm256_or_si256(BG, CG); G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4); __m256i G3b = _mm256_or_si256(DG, G2); __m256i ABmask = _mm256_setr_epi8( 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1); __m256i BBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m256i CBmask = _mm256_setr_epi8(-1, -1, -1, -1, 0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 3, 8, 9, 14, 15); __m256i AB = _mm256_shuffle_epi8(A0a, ABmask); __m256i BB = _mm256_shuffle_epi8(B0a, BBmask); __m256i CB = _mm256_shuffle_epi8(C0a, CBmask); __m256i DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1); __m256i B0 = _mm256_or_si256(AB, BB); __m256i B1 = _mm256_or_si256(BB, CB); __m256i B2 = _mm256_permute2f128_si256(B0, 
B1, 3 << 4); __m256i B3a = _mm256_or_si256(DB, B2); AB = _mm256_shuffle_epi8(A0b, ABmask); BB = _mm256_shuffle_epi8(B0b, BBmask); CB = _mm256_shuffle_epi8(C0b, CBmask); DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1); B0 = _mm256_or_si256(AB, BB); B1 = _mm256_or_si256(BB, CB); B2 = _mm256_permute2f128_si256(B0, B1, 3 << 4); __m256i B3b = _mm256_or_si256(DB, B2); ret.v0 = {mergeR}; ret.v1 = {mergeG}; ret.v2 = {mergeB}; return ret;'''. \ format(mergeR=x86.setr(simd_ext, typ, 'R3a', 'R3b'), mergeG=x86.setr(simd_ext, typ, 'G3a', 'G3b'), mergeB=x86.setr(simd_ext, typ, 'B3a', 'B3b'), **fmtspec) if typ in ['f32', 'i32', 'u32']: return \ '''nsimd_{simd_ext}_v{typ}x3 ret; {load_v0v1v2} __m512i RABm = _mm512_setr_epi32( 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0 , 0, 0); __m512i RABCm = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6 , 7, 8, 9, 10, 17, 20, 23, 26, 29); __m512i GABm = _mm512_setr_epi32( 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0 , 0, 0); __m512i GABCm = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6 , 7, 8, 9, 10, 18, 21, 24, 27, 30); __m512i BABm = _mm512_setr_epi32( 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 0, 0, 0 , 0, 0); __m512i BABCm = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6 , 7, 8, 9, 16, 19, 22, 25, 28, 31); {styp} R = _mm512_permutex2var{suf}(v0, RABm, v1); ret.v0 = _mm512_permutex2var{suf}(R, RABCm, v2); {styp} G = _mm512_permutex2var{suf}(v0, GABm, v1); ret.v1 = _mm512_permutex2var{suf}(G, GABCm, v2); {styp} B = _mm512_permutex2var{suf}(v0, BABm, v1); ret.v2 = _mm512_permutex2var{suf}(B, BABCm, v2); return ret;'''.format(**fmtspec) if typ in ['f64', 'i64', 'u64']: return \ '''nsimd_{simd_ext}_v{typ}x3 ret; {load_v0v1v2} __m512i R_mask0 = _mm512_set_epi64( 0, 0, 15, 12, 9, 6, 3, 0); __m512i R_mask1 = _mm512_set_epi64(13, 10, 5, 4, 3, 2, 1, 0); {styp} A1 = _mm512_permutex2var{suf}(v0, R_mask0, v1); ret.v0 = _mm512_permutex2var{suf}(A1, R_mask1, v2); __m512i G_mask0 = _mm512_set_epi64( 0, 0, 0, 13, 10, 7, 4, 1); __m512i G_mask1 = _mm512_set_epi64(14, 11, 
8, 4, 3, 2, 1, 0); {styp} B1 = _mm512_permutex2var{suf}(v0, G_mask0, v1); ret.v1 = _mm512_permutex2var{suf}(B1, G_mask1, v2); __m512i B_mask0 = _mm512_set_epi64( 0, 0, 0, 14, 11, 8, 5, 2); __m512i B_mask1 = _mm512_set_epi64(15, 12, 9, 4, 3, 2, 1, 0); {styp} C1 = _mm512_permutex2var{suf}(v0, B_mask0, v1); ret.v2 = _mm512_permutex2var{suf}(C1, B_mask1, v2); return ret;'''.format(**fmtspec) ############################################################################### def store3_avx512(simd_ext, typ, align, fmtspec2): fmtspec = fmtspec2.copy() fmtspec['exlo_in1'] = x86.extract(simd_ext, typ, x86.LO, common.in1) fmtspec['exhi_in1'] = x86.extract(simd_ext, typ, x86.HI, common.in1) fmtspec['exlo_in2'] = x86.extract(simd_ext, typ, x86.LO, common.in2) fmtspec['exhi_in2'] = x86.extract(simd_ext, typ, x86.HI, common.in2) fmtspec['exlo_in3'] = x86.extract(simd_ext, typ, x86.LO, common.in3) fmtspec['exhi_in3'] = x86.extract(simd_ext, typ, x86.HI, common.in3) fmtspec['a'] = 'a' if align else 'u' if typ in ['i8', 'u8']: return \ '''__m256i R0 = {exlo_in1}; __m256i R1 = {exhi_in1}; __m256i G0 = {exlo_in2}; __m256i G1 = {exhi_in2}; __m256i B0 = {exlo_in3}; __m256i B1 = {exhi_in3}; __m256i RACm = _mm256_setr_epi8( 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5, -1, 27, -1, -1, 28, -1, -1, 29, -1, -1, 30, -1, -1, 31, -1, -1); __m256i RBBm = _mm256_setr_epi8(-1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 16, -1, -1, 17, -1, -1, 18, -1, -1, 19, -1, -1, 20, -1, -1, 21); __m256i RCAm = _mm256_setr_epi8(-1, -1, 22, -1, -1, 23, -1, -1, 24, -1, -1, 25, -1, -1, 26, -1, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1); __m256i GACm = _mm256_setr_epi8(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, -1, -1, 27, -1, -1, 28, -1, -1, 29, -1, -1, 30, -1, -1, 31, -1); __m256i GBBm = _mm256_setr_epi8(-1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 16, -1, -1, 17, -1, -1, 18, -1, -1, 19, -1, -1, 20, -1, -1); __m256i GCAm = 
_mm256_setr_epi8(21, -1, -1, 22, -1, -1, 23, -1, -1, 24, -1, -1, 25, -1, -1, 26, 05, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10); __m256i BACm = _mm256_setr_epi8(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, 26, -1, -1, 27, -1, -1, 28, -1, -1, 29, -1, -1, 30, -1, -1, 31); __m256i BBBm = _mm256_setr_epi8(10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 16, -1, -1, 17, -1, -1, 18, -1, -1, 19, -1, -1, 20, -1); __m256i BCAm = _mm256_setr_epi8(-1, 21, -1, -1, 22, -1, -1, 23, -1, -1, 24, -1, -1, 25, -1, -1, -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1); __m256i RAC = _mm256_shuffle_epi8(R0, RACm); __m256i GAC = _mm256_shuffle_epi8(G0, GACm); __m256i BAC = _mm256_shuffle_epi8(B0, BACm); __m256i AC0 = _mm256_or_si256(RAC, GAC); AC0 = _mm256_or_si256(AC0, BAC); __m256i RBB = _mm256_shuffle_epi8(R0, RBBm); __m256i GBB = _mm256_shuffle_epi8(G0, GBBm); __m256i BBB = _mm256_shuffle_epi8(B0, BBBm); __m256i BB0 = _mm256_or_si256(RBB, GBB); BB0 = _mm256_or_si256(BB0, BBB); __m256i RCA = _mm256_shuffle_epi8(R0, RCAm); __m256i GCA = _mm256_shuffle_epi8(G0, GCAm); __m256i BCA = _mm256_shuffle_epi8(B0, BCAm); __m256i CA0 = _mm256_or_si256(RCA, GCA); CA0 = _mm256_or_si256(CA0, BCA); __m256i AA0 = _mm256_permute2f128_si256(AC0, CA0, 2 << 4); __m256i CC0 = _mm256_permute2f128_si256(AC0, CA0, (1 << 4) | 3); RAC = _mm256_shuffle_epi8(R1, RACm); GAC = _mm256_shuffle_epi8(G1, GACm); BAC = _mm256_shuffle_epi8(B1, BACm); __m256i AC1 = _mm256_or_si256(RAC, GAC); AC1 = _mm256_or_si256(AC1, BAC); RBB = _mm256_shuffle_epi8(R1, RBBm); GBB = _mm256_shuffle_epi8(G1, GBBm); BBB = _mm256_shuffle_epi8(B1, BBBm); __m256i BB1 = _mm256_or_si256(RBB, GBB); BB1 = _mm256_or_si256(BB1, BBB); RCA = _mm256_shuffle_epi8(R1, RCAm); GCA = _mm256_shuffle_epi8(G1, GCAm); BCA = _mm256_shuffle_epi8(B1, BCAm); __m256i CA1 = _mm256_or_si256(RCA, GCA); CA1 = _mm256_or_si256(CA1, BCA); __m256i AA1 = _mm256_permute2f128_si256(AC1, CA1, 2 << 4); __m256i CC1 = 
_mm256_permute2f128_si256(AC1, CA1, (1 << 4) | 3); __m512i A = {mergeA0B0}; __m512i B = {mergeC0A1}; __m512i C = {mergeB1C1}; {store}'''. \ format(mergeA0B0=x86.setr(simd_ext, typ, 'AA0', 'BB0'), mergeC0A1=x86.setr(simd_ext, typ, 'CC0', 'AA1'), mergeB1C1=x86.setr(simd_ext, typ, 'BB1', 'CC1'), store=store3(simd_ext, typ, align, fmtspec, 'A', 'B', 'C'), **fmtspec) if typ in ['i16', 'u16']: return \ '''__m256i R0a = {exlo_in1}; __m256i R0b = {exhi_in1}; __m256i G0a = {exlo_in2}; __m256i G0b = {exhi_in2}; __m256i B0a = {exlo_in3}; __m256i B0b = {exhi_in3}; __m256i RACm = _mm256_setr_epi8( 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1); __m256i RBBm = _mm256_setr_epi8(-1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); __m256i RCAm = _mm256_setr_epi8(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); __m256i GACm = _mm256_setr_epi8(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1); __m256i GBBm = _mm256_setr_epi8(10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); __m256i GCAm = _mm256_setr_epi8(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); __m256i BACm = _mm256_setr_epi8(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15); __m256i BBBm = _mm256_setr_epi8(-1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); __m256i BCAm = _mm256_setr_epi8( 4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, 4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); __m256i RAC = _mm256_shuffle_epi8(R0a, RACm); 
__m256i GAC = _mm256_shuffle_epi8(G0a, GACm); __m256i BAC = _mm256_shuffle_epi8(B0a, BACm); __m256i RBB = _mm256_shuffle_epi8(R0a, RBBm); __m256i GBB = _mm256_shuffle_epi8(G0a, GBBm); __m256i BBB = _mm256_shuffle_epi8(B0a, BBBm); __m256i RCA = _mm256_shuffle_epi8(R0a, RCAm); __m256i GCA = _mm256_shuffle_epi8(G0a, GCAm); __m256i BCA = _mm256_shuffle_epi8(B0a, BCAm); __m256i AC = _mm256_or_si256(RAC, GAC); AC = _mm256_or_si256(AC, BAC); __m256i BBa = _mm256_or_si256(RBB, GBB); BBa = _mm256_or_si256(BBa, BBB); __m256i CA = _mm256_or_si256(RCA, GCA); CA = _mm256_or_si256(CA, BCA); __m256i AAa = _mm256_permute2f128_si256(AC, CA, 2 << 4); __m256i CCa = _mm256_permute2f128_si256(AC, CA, (1 << 4) | 3); RAC = _mm256_shuffle_epi8(R0b, RACm); GAC = _mm256_shuffle_epi8(G0b, GACm); BAC = _mm256_shuffle_epi8(B0b, BACm); RBB = _mm256_shuffle_epi8(R0b, RBBm); GBB = _mm256_shuffle_epi8(G0b, GBBm); BBB = _mm256_shuffle_epi8(B0b, BBBm); RCA = _mm256_shuffle_epi8(R0b, RCAm); GCA = _mm256_shuffle_epi8(G0b, GCAm); BCA = _mm256_shuffle_epi8(B0b, BCAm); AC = _mm256_or_si256(RAC, GAC); AC = _mm256_or_si256(AC, BAC); __m256i BBb = _mm256_or_si256(RBB, GBB); BBb = _mm256_or_si256(BBb, BBB); CA = _mm256_or_si256(RCA, GCA); CA = _mm256_or_si256(CA, BCA); __m256i AAb = _mm256_permute2f128_si256(AC, CA, 2 << 4); __m256i CCb = _mm256_permute2f128_si256(AC, CA, (1 << 4) | 3); __m512i A = {mergeAaBa}; __m512i B = {mergeCaAb}; __m512i C = {mergeBbCb}; {store}'''. 
\ format(mergeAaBa=x86.setr(simd_ext, typ, 'AAa', 'BBa'), mergeCaAb=x86.setr(simd_ext, typ, 'CCa', 'AAb'), mergeBbCb=x86.setr(simd_ext, typ, 'BBb', 'CCb'), store=store3(simd_ext, typ, align, fmtspec, 'A', 'B', 'C'), **fmtspec) if typ in ['f32', 'i32', 'u32']: return \ '''__m512i ARGm = _mm512_setr_epi32( 0, 16, 0, 1, 17, 0, 2, 18, 0, 3, 19, 0, 4, 20, 0, 5); __m512i ARGBm = _mm512_setr_epi32( 0, 1, 16, 3, 4, 17, 6, 7, 18, 9, 10, 19, 12, 13, 20, 15); __m512i BRGm = _mm512_setr_epi32(21, 0, 6, 22, 0, 7, 23, 0, 8, 24, 0, 9, 25, 0, 10, 26); __m512i BRGBm = _mm512_setr_epi32( 0, 21, 2, 3, 22, 5, 6, 23, 8, 9, 24, 11, 12, 25, 14, 15); __m512i CRGm = _mm512_setr_epi32( 0, 11, 27, 0, 12, 28, 0, 13, 29, 0, 14, 30, 0, 15, 31, 0); __m512i CRGBm = _mm512_setr_epi32(26, 1, 2, 27, 4, 5, 28, 7, 8, 29, 10, 11, 30, 13, 14, 31); {styp} A = _mm512_permutex2var{suf}({in1}, ARGm, {in2}); A = _mm512_permutex2var{suf}(A, ARGBm, {in3}); {styp} B = _mm512_permutex2var{suf}({in1}, BRGm, {in2}); B = _mm512_permutex2var{suf}(B, BRGBm, {in3}); {styp} C = _mm512_permutex2var{suf}({in1}, CRGm, {in2}); C = _mm512_permutex2var{suf}(C, CRGBm, {in3}); {store}'''. 
\ format(store=store3(simd_ext, typ, align, fmtspec, 'A', 'B', 'C'), **fmtspec) if typ in ['f64', 'i64', 'u64']: return \ '''__m512i A_mask0 = _mm512_set_epi64(10, 2, 0, 9, 1, 0, 8, 0); __m512i A_mask1 = _mm512_set_epi64( 7, 6, 9, 4, 3, 8, 1, 0); {styp} A1 = _mm512_permutex2var{suf}({in1}, A_mask0, {in2}); {styp} A2 = _mm512_permutex2var{suf}(A1, A_mask1, {in3}); __m512i B_mask0 = _mm512_set_epi64( 5, 0, 12, 4, 0, 11, 3, 0); __m512i B_mask1 = _mm512_set_epi64( 7, 12, 5, 4, 11, 2, 1, 10); {styp} B1 = _mm512_permutex2var{suf}({in1}, B_mask0, {in2}); {styp} B2 = _mm512_permutex2var{suf}(B1, B_mask1, {in3}); __m512i C_mask0 = _mm512_set_epi64( 0, 15, 7, 0, 14, 6, 0, 13); __m512i C_mask1 = _mm512_set_epi64(15, 6, 5, 14, 3, 2, 13, 0); {styp} C1 = _mm512_permutex2var{suf}({in1}, C_mask0, {in2}); {styp} C2 = _mm512_permutex2var{suf}(C1, C_mask1, {in3}); {store}'''. \ format(store=store3(simd_ext, typ, align, fmtspec, 'A2', 'B2', 'C2'), **fmtspec) ================================================ FILE: examples/module_fixed_point.cpp ================================================ // Copyright (c) 2019 Agenium Scale // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal in the Software without restriction, including without limitation the // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or // sell copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS // IN THE SOFTWARE. #include #include #include #include float rand_float() { return 4.0f * ((float) rand() / (float) RAND_MAX) - 2.0f; } int main() { // We use fixed point numbers with 8 bits of integer part and 8 bits of // decimal part. It will use 32 bits integers for internal storage. typedef nsimd::fixed_point::fp_t<8, 8> fp_t; typedef nsimd::fixed_point::pack fp_pack_t; const size_t v_size = nsimd::fixed_point::len(fp_t()); fp_t *input0 = (fp_t*)malloc(v_size * sizeof(fp_t)); fp_t *input1 = (fp_t *)malloc(v_size * sizeof(fp_t)); fp_t *res = (fp_t *)malloc(v_size * sizeof(fp_t)); // Input and output initializations for(size_t i = 0; i < nsimd::fixed_point::len(fp_t()); i++) { input0[i] = fp_t(rand_float()); input1[i] = fp_t(rand_float()); } fp_pack_t v0 = nsimd::fixed_point::loadu(input0); fp_pack_t v1 = nsimd::fixed_point::loadu(input1); fp_pack_t vres = nsimd::fixed_point::add(v0, v1); nsimd::fixed_point::storeu(res, vres); for(size_t i = 0; i < nsimd::fixed_point::len(fp_t()); i++) { std::cout << float(input0[i]) << " | " << float(input1[i]) << " | " << float(res[i]) << "\n"; } std::cout << std::endl; return EXIT_SUCCESS; } ================================================ FILE: examples/tutorial.cpp ================================================ #include #include #include #include template void uppercase_scalar(T *dst, const T *src, int n) { for (int i = 0; i < n; i++) { if (src[i] >= 'a' && src[i] <= 'z') { dst[i] = src[i] + ('A' - 'a'); } else { dst[i] = src[i]; } } } template void uppercase_simd(T *dst, const T *src, int n) { using namespace nsimd; typedef pack p_t; typedef packl pl_t; int l = len(); int i; for (i = 0; i + l <= n; i += l) { p_t text = loadu(src + i); pl_t mask = text >= 'a' && text <= 'z'; 
p_t then_pack = text + ('A' - 'a'); p_t TEXT = if_else(mask, then_pack, text); storeu(dst + i, TEXT); } pl_t mask = mask_for_loop_tail(i, n); p_t text = maskz_loadu(mask, src + i); p_t TEXT = if_else(text >= 'a' && text <= 'z', text + ('A' - 'a'), text); mask_storeu(mask, dst + i, TEXT); } int main(int argc, char **argv) { std::string input; for (int i = 1; i < argc; i++) { input += std::string(argv[i]); if (i < argc - 1) { input += std::string(" "); } } std::cout << "Orignal text : " << input << std::endl; std::vector dst_scalar(input.size() + 1); uppercase_scalar(&dst_scalar[0], (i8 *)input.c_str(), (int)input.size()); std::cout << "Scalar uppercase text: " << &dst_scalar[0] << std::endl; std::vector dst_simd(input.size() + 1); uppercase_simd(&dst_simd[0], (i8 *)input.c_str(), (int)input.size()); std::cout << "NSIMD uppercase text : " << &dst_simd[0] << std::endl; return 0; } ================================================ FILE: include/nsimd/c_adv_api.h ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE. */

#ifndef NSIMD_C_ADV_API_H
#define NSIMD_C_ADV_API_H

/* NOTE(review): the header name of this #include was stripped during
   extraction (presumably <nsimd/nsimd.h>) -- confirm against upstream. */
#include

/* The C advanced API requires C11: the generated header included below
   relies on _Generic-based dispatch. */
#if NSIMD_C >= 2011

/* Fallback symbol selected by the generated _Generic expressions when a
   type is not supported; calling it is a no-op. */
NSIMD_INLINE void nsimd_c11_type_unsupported(void) {}

/* ------------------------------------------------------------------------- */

/* NOTE(review): header name stripped here as well (presumably the generated
   advanced C API functions header) -- confirm against upstream. */
#include

/* ------------------------------------------------------------------------- */

/* We add by hand parametrized loads/stores. */

/* loads */

#define nsimd_load_aligned(type, ptr) nsimd_loada(type, ptr)
#define nsimd_load_unaligned(type, ptr) nsimd_loadu(type, ptr)

/* Dispatch on `alignment`, which must be the bare token `aligned` or
   `unaligned`; it is pasted to form one of the two macros above. */
#define nsimd_load(alignment, type, ptr) \
  NSIMD_PP_CAT_2(nsimd_load_, alignment)(type, ptr)

/* stores */

#define nsimd_store_aligned(ptr, vec) nsimd_storea(ptr, vec)
#define nsimd_store_unaligned(ptr, vec) nsimd_storeu(ptr, vec)
#define nsimd_store(alignment, ptr, vec) \
  NSIMD_PP_CAT_2(nsimd_store_, alignment)(ptr, vec)

/* ------------------------------------------------------------------------- */

/* Generic types: nsimd_pack(f32) expands to nsimd_pack_f32, etc. */

#define nsimd_pack(type) NSIMD_PP_CAT_2(nsimd_pack_, type)
#define nsimd_packl(type) NSIMD_PP_CAT_2(nsimd_packl_, type)
#define nsimd_packx2(type) NSIMD_PP_CAT_2(nsimd_packx2_, type)
#define nsimd_packx3(type) NSIMD_PP_CAT_2(nsimd_packx3_, type)
#define nsimd_packx4(type) NSIMD_PP_CAT_2(nsimd_packx4_, type)

/* NOTE(review): the five *_a macros below accept a simd_ext argument but
   never use it, and pass only two arguments to the 3-ary NSIMD_PP_CAT_3;
   a third argument involving simd_ext was probably lost during extraction
   -- confirm against upstream. */
#define nsimd_pack_a(type, simd_ext) NSIMD_PP_CAT_3(nsimd_pack_, type)
#define nsimd_packl_a(type, simd_ext) NSIMD_PP_CAT_3(nsimd_packl_, type)
#define nsimd_packx2_a(type, simd_ext) NSIMD_PP_CAT_3(nsimd_packx2_, type)
#define nsimd_packx3_a(type, simd_ext) NSIMD_PP_CAT_3(nsimd_packx3_, type)
#define nsimd_packx4_a(type, simd_ext) NSIMD_PP_CAT_3(nsimd_packx4_, type)

#endif /* NSIMD_C >= 2011 */

/* Fixed closing comment: it previously read NSIMD_C_ADV_API_HPP, which does
   not match the include guard NSIMD_C_ADV_API_H opened above. */
#endif /* NSIMD_C_ADV_API_H */

================================================
FILE: include/nsimd/cxx_adv_api.hpp
================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #ifndef NSIMD_CXX_ADV_API_HPP #define NSIMD_CXX_ADV_API_HPP #include #include // ---------------------------------------------------------------------------- namespace nsimd { // ---------------------------------------------------------------------------- // "mimic" static_assert in C++98 template struct nsimd_static_assert; template <> struct nsimd_static_assert {}; // ---------------------------------------------------------------------------- // Definition of pack template NSIMD_STRUCT pack; template NSIMD_STRUCT pack { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = 1; static const int soa_num_packs = 1; simd_vector car; // Default ctor pack() {} // Ctor that splats template pack(S const &s) { car = set1(T(s), T(), SimdExt()); } // Ctor taking a SIMD vector pack(simd_vector v) { car = v; } // Underlying native SIMD vector getter simd_vector native_register() const { return car; } // Arithmetic and assignment operators pack &operator+=(pack const &other); pack &operator-=(pack const &other); pack &operator*=(pack const &other); pack &operator/=(pack const &other); pack &operator|=(pack const &other); pack &operator&=(pack const &other); pack &operator^=(pack const &other); pack &operator<<=(int); pack &operator>>=(int); // For std::cout'ing a pack friend std::ostream &operator<<(std::ostream &os, pack const &a0) { T buf[max_len_t::value]; storeu(buf, a0.car, T(), SimdExt()); os << "{ "; int n = len(a0); for (int i = 0; i < n; i++) { os << to_biggest(buf[i]); if (i < n - 1) { os << ", "; } } os << " }"; return os; } }; template NSIMD_STRUCT pack { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 1; simd_vector car; pack cdr; // Default ctor pack() {} // Ctor that splats template pack(S const &s) : cdr(s) { car = set1(T(s), T(), SimdExt()); } // Arithmetic and 
assignment operators pack &operator+=(pack const &other); pack &operator-=(pack const &other); pack &operator*=(pack const &other); pack &operator/=(pack const &other); pack &operator|=(pack const &other); pack &operator&=(pack const &other); pack &operator^=(pack const &other); pack &operator<<=(int); pack &operator>>=(int); // For std::cout'ing a pack friend std::ostream &operator<<(std::ostream &os, pack const &a0) { os << pack(a0.car) << ", " << a0.cdr; return os; } }; #if NSIMD_CXX >= 2020 template struct is_pack_t : public std::false_type {}; template struct is_pack_t > : public std::true_type {}; template concept is_pack_c = is_pack_t::value; #define NSIMD_CONCEPT_PACK nsimd::is_pack_c #else #define NSIMD_CONCEPT_PACK typename #endif // ---------------------------------------------------------------------------- // Definition of logical template NSIMD_STRUCT packl; template NSIMD_STRUCT packl { typedef typename simd_traits::simd_vectorl simd_vectorl; simd_vectorl car; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = 1; // Default ctor packl() {} // Ctor taking a SIMD vector packl(simd_vectorl v) { car = v; } // Ctor that splats template packl(S const &s) { car = set1l(int(s), T(), SimdExt()); } // Underlying native SIMD vector getter simd_vectorl native_register() const { return car; } // For std::cout'ing a packl friend std::ostream &operator<<(std::ostream &os, packl const &a0) { T buf[max_len_t::value]; storelu(buf, a0.car, T(), SimdExt()); os << "{ "; int n = len(a0); for (int i = 0; i < n; i++) { os << buf[i]; if (i < n - 1) { os << ", "; } } os << " }"; return os; } }; template NSIMD_STRUCT packl { typename simd_traits::simd_vectorl car; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; packl cdr; // Default ctor packl() {} // Ctor that splats template packl(S const &s) : cdr(s) { car = set1l(int(s), T(), SimdExt()); } // For std::cout'ing a packl friend std::ostream &operator<<(std::ostream &os, 
packl const &a0) { os << packl(a0.car) << ", " << a0.cdr; return os; } }; #if NSIMD_CXX >= 2020 template struct is_packl_t : public std::false_type {}; template struct is_packl_t > : public std::true_type {}; template concept is_packl_c = is_packl_t::value; #define NSIMD_CONCEPT_PACKL nsimd::is_packl_c #else #define NSIMD_CONCEPT_PACKL typename #endif // ---------------------------------------------------------------------------- // Definition of SOA of degree 1 template NSIMD_STRUCT packx1; template NSIMD_STRUCT packx1 { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = 1; static const int soa_num_packs = 1; pack v0; void set_car(simd_vector v0_) { v0.car = v0_; } }; template NSIMD_STRUCT packx1 { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 1; pack v0; void set_car(simd_vector v0_) { v0.car = v0_; } void set_cdr(pack const &v0_) { v0.cdr = v0_; } }; #if NSIMD_CXX >= 2020 template struct is_packx1_t : public std::false_type {}; template struct is_packx1_t > : public std::true_type {}; template concept is_packx1_c = is_packx1_t::value; #define NSIMD_CONCEPT_PACKX1 nsimd::is_packx1_c #else #define NSIMD_CONCEPT_PACKX1 typename #endif // ---------------------------------------------------------------------------- // Definition of SOA of degree 2 template NSIMD_STRUCT packx2; template NSIMD_STRUCT packx2 { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = 1; static const int soa_num_packs = 2; pack v0; pack v1; void set_car(simd_vector v0_, simd_vector v1_) { v0.car = v0_; v1.car = v1_; } }; template NSIMD_STRUCT packx2 { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 2; pack v0; 
pack v1; void set_car(simd_vector v0_, simd_vector v1_) { v0.car = v0_; v1.car = v1_; } void set_cdr(pack const &v0_, pack const &v1_) { v0.cdr = v0_; v1.cdr = v1_; } }; #if NSIMD_CXX >= 2020 template struct is_packx2_t : public std::false_type {}; template struct is_packx2_t > : public std::true_type {}; template concept is_packx2_c = is_packx2_t::value; #define NSIMD_CONCEPT_PACKX2 nsimd::is_packx2_c #else #define NSIMD_CONCEPT_PACKX2 typename #endif // ---------------------------------------------------------------------------- // Definition of SOA of degree 3 template NSIMD_STRUCT packx3; template NSIMD_STRUCT packx3 { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = 1; static const int soa_num_packs = 3; pack v0; pack v1; pack v2; void set_car(simd_vector v0_, simd_vector v1_, simd_vector v2_) { v0.car = v0_; v1.car = v1_; v2.car = v2_; } }; template NSIMD_STRUCT packx3 { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 3; pack v0; pack v1; pack v2; void set_car(simd_vector v0_, simd_vector v1_, simd_vector v2_) { v0.car = v0_; v1.car = v1_; v2.car = v2_; } void set_cdr(pack const &v0_, pack const &v1_, pack const &v2_) { v0.cdr = v0_; v1.cdr = v1_; v2.cdr = v2_; } }; #if NSIMD_CXX >= 2020 template struct is_packx3_t : public std::false_type {}; template struct is_packx3_t > : public std::true_type {}; template concept is_packx3_c = is_packx3_t::value; #define NSIMD_CONCEPT_PACKX3 nsimd::is_packx3_c #else #define NSIMD_CONCEPT_PACKX3 typename #endif // ---------------------------------------------------------------------------- // Definition of SOA of degree 4 template NSIMD_STRUCT packx4; template NSIMD_STRUCT packx4 { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = 1; static const int 
soa_num_packs = 4; pack v0; pack v1; pack v2; pack v3; void set_car(simd_vector v0_, simd_vector v1_, simd_vector v2_, simd_vector v3_) { v0.car = v0_; v1.car = v1_; v2.car = v2_; v3.car = v3_; } }; template NSIMD_STRUCT packx4 { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 4; pack v0; pack v1; pack v2; pack v3; void set_car(simd_vector v0_, simd_vector v1_, simd_vector v2_, simd_vector v3_) { v0.car = v0_; v1.car = v1_; v2.car = v2_; v3.car = v3_; } void set_cdr( pack const &v0_, pack const &v1_, pack const &v2_, pack const &v3_) { v0.cdr = v0_; v1.cdr = v1_; v2.cdr = v2_; v3.cdr = v3_; } }; #if NSIMD_CXX >= 2020 template struct is_packx4_t : public std::false_type {}; template struct is_packx4_t > : public std::true_type {}; template concept is_packx4_c = is_packx4_t::value; #define NSIMD_CONCEPT_PACKX4 nsimd::is_packx4_c #else #define NSIMD_CONCEPT_PACKX4 typename #endif // ---------------------------------------------------------------------------- // A C++20 concept #if NSIMD_CXX >=2020 template concept any_pack_c = is_pack_c || is_packl_c || is_packx1_c || is_packx2_c || is_packx3_c || is_packx4_c; #define NSIMD_CONCEPT_ANY_PACK nsimd::any_pack_c #else #define NSIMD_CONCEPT_ANY_PACK typename #endif // ---------------------------------------------------------------------------- // The len function cannot be auto-generated template int len(pack const &) { return N * len(T(), SimdExt()); } template int len(packl const &) { return N * len(T(), SimdExt()); } template int len(packx1 const &) { return N * len(T(), SimdExt()); } template int len(packx2 const &) { return 2 * N * len(T(), SimdExt()); } template int len(packx3 const &) { return 3 * N * len(T(), SimdExt()); } template int len(packx4 const &) { return 4 * N * len(T(), SimdExt()); } template int len() { return len(Pack()); } // 
---------------------------------------------------------------------------- // The addv function cannot be auto-generated template T addv(pack const &a0) { return addv(a0.car, T(), SimdExt()); } template T addv(pack const &a0) { return addv(a0.car, T(), SimdExt()) + addv(a0.cdr); } // ---------------------------------------------------------------------------- // The all function cannot be auto-generated template int all(packl const &a0) { return all(a0.car, T(), SimdExt()); } template int all(packl const &a0) { return all(a0.car, T(), SimdExt()) && all(a0.cdr); } // ---------------------------------------------------------------------------- // The any function cannot be auto-generated template int any(packl const &a0) { return any(a0.car, T(), SimdExt()); } template int any(packl const &a0) { return any(a0.car, T(), SimdExt()) || any(a0.cdr); } // ---------------------------------------------------------------------------- // The nbtrue function cannot be auto-generated template int nbtrue(packl const &a0) { return nbtrue(a0.car, T(), SimdExt()); } template int nbtrue(packl const &a0) { return nbtrue(a0.car, T(), SimdExt()) + nbtrue(a0.cdr); } // ---------------------------------------------------------------------------- // Include functions that act on packs } // namespace nsimd #include namespace nsimd { // ---------------------------------------------------------------------------- // Arithmetic and assignment operators // add template pack &pack:: operator+=(pack const &other) { this->car = add(this->car, other.car, T()); return *this; } template pack &pack:: operator+=(pack const &other) { this->car = add(this->car, other.car, T()); this->cdr += other.cdr; return *this; } // sub template pack &pack:: operator-=(pack const &other) { this->car = sub(this->car, other.car, T()); return *this; } template pack &pack:: operator-=(pack const &other) { this->car = sub(this->car, other.car, T()); this->cdr -= other.cdr; return *this; } // mul template pack &pack:: 
operator*=(pack const &other) { this->car = mul(this->car, other.car, T()); return *this; } template pack &pack:: operator*=(pack const &other) { this->car = mul(this->car, other.car, T()); this->cdr *= other.cdr; return *this; } // div template pack &pack:: operator/=(pack const &other) { this->car = div(this->car, other.car, T()); return *this; } template pack &pack:: operator/=(pack const &other) { this->car = div(this->car, other.car, T()); this->cdr /= other.cdr; return *this; } // orb template pack &pack:: operator|=(pack const &other) { this->car = orb(this->car, other.car, T()); return *this; } template pack &pack:: operator|=(pack const &other) { this->car = orb(this->car, other.car, T()); this->cdr |= other.cdr; return *this; } // andb template pack &pack:: operator&=(pack const &other) { this->car = andb(this->car, other.car, T()); return *this; } template pack &pack:: operator&=(pack const &other) { this->car = andb(this->car, other.car, T()); this->cdr &= other.cdr; return *this; } // xorb template pack &pack:: operator^=(pack const &other) { this->car = xorb(this->car, other.car, T()); return *this; } template pack &pack:: operator^=(pack const &other) { this->car = xorb(this->car, other.car, T()); this->cdr ^= other.cdr; return *this; } // left shift template pack &pack::operator<<=(int s) { this->car = shl(this->car, s, T()); return *this; } template pack &pack::operator<<=(int s) { this->car = shl(this->car, s, T()); this->cdr <<= s; return *this; } // right shift template pack &pack::operator>>=(int s) { this->car = shr(this->car, s, T()); return *this; } template pack &pack::operator>>=(int s) { this->car = shr(this->car, s, T()); this->cdr >>= s; return *this; } // ---------------------------------------------------------------------------- // The if_else function cannot be auto-generated template NSIMD_REQUIRES(sizeof_v == sizeof_v) pack if_else(packl const &a0, pack const &a1, pack const &a2) { pack ret; ret.car = if_else(a0.car, a1.car, 
a2.car, L(), T(), SimdExt()); return ret; } template NSIMD_REQUIRES(sizeof_v == sizeof_v) pack if_else(packl const &a0, pack const &a1, pack const &a2) { pack ret; ret.car = if_else(a0.car, a1.car, a2.car, L(), T(), SimdExt()); ret.cdr = if_else(a0.cdr, a1.cdr, a2.cdr); return ret; } // ---------------------------------------------------------------------------- // Mask loads and stores cannot be auto-generated template NSIMD_REQUIRES(sizeof_v == sizeof_v) void mask_storea(packl const &a0, T *a1, pack const &a2) { mask_storea1(reinterpretl >(a0), a1, a2); } template NSIMD_REQUIRES(sizeof_v == sizeof_v) void mask_storeu(packl const &a0, T *a1, pack const &a2) { mask_storeu1(reinterpretl >(a0), a1, a2); } template NSIMD_REQUIRES(sizeof_v == sizeof_v) pack maskz_loada(packl const &a0, const T *a1) { return maskz_loada1(reinterpretl >(a0), a1); } template NSIMD_REQUIRES(sizeof_v == sizeof_v) pack maskz_loadu(packl const &a0, const T *a1) { return maskz_loadu1(reinterpretl >(a0), a1); } template NSIMD_REQUIRES(sizeof_v == sizeof_v) pack masko_loada(packl const &a0, const T *a1, pack const &a2) { return masko_loada1(reinterpretl >(a0), a1, a2); } template NSIMD_REQUIRES(sizeof_v == sizeof_v) pack masko_loadu(packl const &a0, const T *a1, pack const &a2) { return masko_loadu1(reinterpretl >(a0), a1, a2); } // ---------------------------------------------------------------------------- // Loads/Stores templated on the alignment cannot be auto-generated namespace detail { template struct loadz_return_t { typedef nsimd::pack type; }; template struct load_helper {}; template struct load_helper { typedef typename SimdVector::value_type T; typedef typename SimdVector::simd_ext simd_ext; static const int N = SimdVector::unroll; static SimdVector load(const T *a0) { return loada(a0); } static SimdVector loadl(const T *a0) { return loadla(a0); } static SimdVector load2(const T *a0) { return load2a(a0); } static SimdVector load3(const T *a0) { return load3a(a0); } static SimdVector 
load4(const T *a0) { return load4a(a0); } static SimdVector maskz_load(packl const &a0, const T *a1) { return maskz_loada(a0, a1); } static pack masko_load(packl const &a0, const T *a1, pack const &a2) { return masko_loada(a0, a1, a2); } }; template struct load_helper { typedef typename SimdVector::value_type T; typedef typename SimdVector::simd_ext simd_ext; static const int N = SimdVector::unroll; static SimdVector load(const T *a0) { return loadu(a0); } static SimdVector loadl(const T *a0) { return loadlu(a0); } static SimdVector load2(const T *a0) { return load2u(a0); } static SimdVector load3(const T *a0) { return load3u(a0); } static SimdVector load4(const T *a0) { return load4u(a0); } static SimdVector maskz_load(packl const &a0, const T *a1) { return maskz_loadu(a0, a1); } static pack masko_load(packl const &a0, const T *a1, pack const &a2) { return masko_loadu(a0, a1, a2); } }; template struct store_helper {}; #define NSIMD_T typename P::value_type template <> struct store_helper { template static void store(NSIMD_T *a0, P const &a1) { storea(a0, a1); } template #if NSIMD_CXX >= 2020 requires std::is_same_v #endif static void mask_store(PL const &a0, NSIMD_T *a1, P const &a2) { mask_storea(a0, a1, a2); } template static void storel(NSIMD_T *a0, P const &a1) { storela(a0, a1); } template static void store2(NSIMD_T *a0, P const &a1, P const &a2) { store2a(a0, a1, a2); } template static void store3(NSIMD_T *a0, P const &a1, P const &a2, P const &a3) { store3a(a0, a1, a2, a3); } template static void store4(NSIMD_T *a0, P const &a1, P const &a2, P const &a3, P const &a4) { store4a(a0, a1, a2, a3, a4); } }; template <> struct store_helper { template static void store(NSIMD_T *a0, P const &a1) { storeu(a0, a1); } template #if NSIMD_CXX >= 2020 requires std::is_same_v #endif static void mask_store(PL const &a0, NSIMD_T *a1, P const &a2) { mask_storeu(a0, a1, a2); } template static void storel(NSIMD_T *a0, P const &a1) { storelu(a0, a1); } template static void 
store2(NSIMD_T *a0, P const &a1, P const &a2) { store2u(a0, a1, a2); } template static void store3(NSIMD_T *a0, P const &a1, P const &a2, P const &a3) { store3u(a0, a1, a2, a3); } template static void store4(NSIMD_T *a0, P const &a1, P const &a2, P const &a3, P const &a4) { store4u(a0, a1, a2, a3, a4); } }; #undef NSIMD_T } // namespace detail template SimdVector load(const typename SimdVector::value_type *ptr) { return detail::load_helper::load(ptr); } template pack maskz_load(Packl const &pl, const typename Packl::value_type *ptr) { return detail::load_helper, Alignment>::maskz_load(pl, ptr); } template Pack masko_load(Packl const &pl, const typename Pack::value_type *ptr, Pack const &p) { return detail::load_helper::masko_load(pl, ptr, p); } template SimdVector loadl(const typename SimdVector::value_type *ptr) { return detail::load_helper::loadl(ptr); } template SimdVector load2(const typename SimdVector::value_type *ptr) { return detail::load_helper::load2(ptr); } template SimdVector load3(const typename SimdVector::value_type *ptr) { return detail::load_helper::load3(ptr); } template SimdVector load4(const typename SimdVector::value_type *ptr) { return detail::load_helper::load4(ptr); } template void store(typename Pack::value_type *ptr, Pack const &p) { detail::store_helper::store(ptr, p); } template void mask_store(Packl const &pl, typename Pack::value_type *ptr, Pack const &p) { detail::store_helper::mask_store(pl, ptr, p); } template void storel(typename Packl::value_type *ptr, Packl const &pl) { return detail::store_helper::storel(ptr, pl); } template void store2(typename Pack::value_type *ptr, Pack const &p1, Pack const &p2) { return detail::store_helper::store2(ptr, p1, p2); } template void store3(typename Pack::value_type *ptr, Pack const &p1, Pack const &p2, Pack const &p3) { return detail::store_helper::store3(ptr, p1, p2, p3); } template void store4(typename Pack::value_type *ptr, Pack const &p1, Pack const &p2, Pack const &p3, Pack const &p4) { 
return detail::store_helper::store4(ptr, p1, p2, p3, p4); } // ---------------------------------------------------------------------------- template T native_register(T a) { return a; } template typename pack::simd_vector native_register(pack const &a) { return a.car; } // ---------------------------------------------------------------------------- // get_pack template class packx, int Ix> struct get_pack_helper {}; // ---------------------------------------------------------------------------- // get_pack_helper - packx1 template struct get_pack_helper {}; template struct get_pack_helper { const nsimd::pack & operator()(const packx1 &packx_) const { return packx_.v0; } }; // ---------------------------------------------------------------------------- // get_pack_helper - packx2 template struct get_pack_helper {}; template struct get_pack_helper { const nsimd::pack & operator()(const packx2 &packx_) const { return packx_.v0; } }; template struct get_pack_helper { const nsimd::pack & operator()(const packx2 &packx_) const { return packx_.v1; } }; // ---------------------------------------------------------------------------- // get_pack_helper - packx3 template struct get_pack_helper {}; template struct get_pack_helper { const nsimd::pack & operator()(const packx3 &packx_) const { return packx_.v0; } }; template struct get_pack_helper { const nsimd::pack & operator()(const packx3 &packx_) const { return packx_.v1; } }; template struct get_pack_helper { const nsimd::pack & operator()(const packx3 &packx_) const { return packx_.v2; } }; // ---------------------------------------------------------------------------- // get_pack_helper - packx4 template struct get_pack_helper {}; template struct get_pack_helper { const nsimd::pack & operator()(const packx4 &packx_) const { return packx_.v0; } }; template struct get_pack_helper { const nsimd::pack & operator()(const packx4 &packx_) const { return packx_.v1; } }; template struct get_pack_helper { const nsimd::pack & 
operator()(const packx4 &packx_) const { return packx_.v2; } }; template struct get_pack_helper { const nsimd::pack & operator()(const packx4 &packx_) const { return packx_.v3; } }; // ---------------------------------------------------------------------------- // get_pack // get_pack for packx[Y] with Y = 1 template pack get_pack(const pack &pack_) { nsimd_static_assert<0 == Ix>(); return pack_; } // ---------------------------------------------------------------------------- // get_pack // get_pack for packx[Y] with Y in {2, 3, 4} template class packx> pack get_pack(const packx &packx_) { return get_pack_helper()(packx_); } // ---------------------------------------------------------------------------- // to_pack_trait template struct to_pack_trait {}; template class _packx> struct to_pack_trait<_packx > { typedef pack::soa_num_packs * N, SimdExt> value_type; }; // ---------------------------------------------------------------------------- // to_pack // to_pack for packx[Y] with Y = 1 template pack to_pack(const pack &pack_) { return pack_; } template pack to_pack(const pack &pack_) { return pack_; } // ---------------------------------------------------------------------------- // to_pack // to_pack for packx[Y] with Y in {2, 3, 4} template pack to_pack(const packx1 &packx_) { nsimd::pack pack_; pack_.car = packx_.v0.car; return pack_; } template pack to_pack(const packx2 &packx_) { nsimd::pack pack_; pack_.car = packx_.v0.car; pack_.cdr.car = packx_.v1.car; return pack_; } template pack to_pack(const packx3 &packx_) { nsimd::pack pack_; pack_.car = packx_.v0.car; pack_.cdr.car = packx_.v1.car; pack_.cdr.cdr.car = packx_.v2.car; return pack_; } template pack to_pack(const packx4 &packx_) { nsimd::pack pack_; pack_.car = packx_.v0.car; pack_.cdr.car = packx_.v1.car; pack_.cdr.cdr.car = packx_.v2.car; pack_.cdr.cdr.cdr.car = packx_.v3.car; return pack_; } // ---------------------------------------------------------------------------- // to_pack for packx[Y] 1), 
SimdExt> with Y in {2, 3, 4} // Advance template class packx> struct to_pack_recurs_helper { static pack to_pack(const packx &from_packx, const pack &from_pack) { pack to_pack_; to_pack_.car = from_pack.car; to_pack_.cdr = to_pack_recurs_helper::to_pack(from_packx, from_pack.cdr); return to_pack_; } }; // Base case // Base case condition: to_pack_unroll_ix == 1 template class packx> struct to_pack_recurs_helper { static pack to_pack(const packx &from_packx, const pack &from_pack) { (void)from_packx; pack to_pack_; to_pack_.car = from_pack.car; // simd_vector return to_pack_; } }; // Switch: from_packx[i] --> from_packx[i+1] // Switch condition: from_pack_unroll_ix == 1 && to_pack_unroll_ix > 1 template class packx> struct to_pack_recurs_helper { static pack to_pack(const packx &from_packx, const pack &from_pack) { pack to_pack_; to_pack_.car = from_pack.car; // simd_vector // get next pack to_pack_.cdr = to_pack_recurs_helper< T, from_pack_init_N, from_pack_init_N, to_pack_unroll_ix - 1, which_from_pack_ix + 1, SimdExt, packx>::to_pack(from_packx, get_pack(from_packx)); return to_pack_; } }; template class packx> typename to_pack_trait >::value_type to_pack(const packx &from_packx) { static const int to_pack_unroll_ix = packx::soa_num_packs * N; pack to_pack_; to_pack_.car = from_packx.v0.car; // simd_vector to_pack_.cdr = to_pack_recurs_helper< T, N /* from_pack_init_N*/, N - 1 /* from_pack_unroll_ix */, to_pack_unroll_ix - 1 /* to_pack_unroll_ix */, 0 /* which_from_pack_ix */, SimdExt, packx>::to_pack(from_packx, from_packx.v0.cdr); return to_pack_; } // ---------------------------------------------------------------------------- // to_pack_interleave template pack to_pack_interleave(const pack &pack_) { return pack_; } template pack to_pack_interleave(const pack &pack_) { return pack_; } // ---------------------------------------------------------------------------- template pack to_pack_interleave(const packx1 &packx1_) { pack pack_1; pack_1.car = 
packx1_.v0.car; pack_1.cdr = packx1_.v0.cdr; return pack_1; } template pack to_pack_interleave(const packx1 &packx1_N) { pack pack_1; pack_1.car = packx1_N.v0.car; pack_1.cdr = packx1_N.v0.cdr; return pack_1; } // ---------------------------------------------------------------------------- template pack to_pack_interleave(const packx2 &packx2_) { nsimd::pack pack_2; pack_2.car = packx2_.v0.car; pack_2.cdr.car = packx2_.v1.car; return pack_2; } template pack to_pack_interleave(const packx2 &packx2_N) { pack pack_2xN; pack_2xN.car = packx2_N.v0.car; pack_2xN.cdr.car = packx2_N.v1.car; packx2 packx2_n_1; packx2_n_1.v0 = packx2_N.v0.cdr; packx2_n_1.v1 = packx2_N.v1.cdr; pack_2xN.cdr.cdr = to_pack_interleave(packx2_n_1); return pack_2xN; } // ---------------------------------------------------------------------------- template pack to_pack_interleave(const packx3 &packx3_) { nsimd::pack pack_3; pack_3.car = packx3_.v0.car; pack_3.cdr.car = packx3_.v1.car; pack_3.cdr.cdr.car = packx3_.v2.car; return pack_3; } template pack to_pack_interleave(const packx3 &packx3_n) { pack pack_3xn; pack_3xn.car = packx3_n.v0.car; pack_3xn.cdr.car = packx3_n.v1.car; pack_3xn.cdr.cdr.car = packx3_n.v2.car; packx3 packx3_n_1; packx3_n_1.v0 = packx3_n.v0.cdr; packx3_n_1.v1 = packx3_n.v1.cdr; packx3_n_1.v2 = packx3_n.v2.cdr; pack_3xn.cdr.cdr.cdr = to_pack_interleave(packx3_n_1); return pack_3xn; } // ---------------------------------------------------------------------------- template pack to_pack_interleave(const packx4 &packx4_) { nsimd::pack pack_4; pack_4.car = packx4_.v0.car; pack_4.cdr.car = packx4_.v1.car; pack_4.cdr.cdr.car = packx4_.v2.car; pack_4.cdr.cdr.cdr.car = packx4_.v3.car; return pack_4; } template pack to_pack_interleave(const packx4 &packx4_n) { pack pack_4xn; pack_4xn.car = packx4_n.v0.car; pack_4xn.cdr.car = packx4_n.v1.car; pack_4xn.cdr.cdr.car = packx4_n.v2.car; pack_4xn.cdr.cdr.cdr.car = packx4_n.v3.car; packx4 packx4_n_1; packx4_n_1.v0 = packx4_n.v0.cdr; packx4_n_1.v1 
= packx4_n.v1.cdr; packx4_n_1.v2 = packx4_n.v2.cdr; packx4_n_1.v3 = packx4_n.v3.cdr; pack_4xn.cdr.cdr.cdr.cdr = to_pack_interleave(packx4_n_1); return pack_4xn; } } // namespace nsimd #endif ================================================ FILE: include/nsimd/cxx_adv_api_aliases.hpp ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #ifndef NSIMD_CXX_ADV_API_ALIASES_HPP #define NSIMD_CXX_ADV_API_ALIASES_HPP #include namespace nsimd { /* ------------------------------------------------------------------------- */ template pack fabs(pack const &a0) { return abs(a0); } /* ------------------------------------------------------------------------- */ template pack fmin(pack const &a0, pack const &a1) { return min(a0, a1); } /* ------------------------------------------------------------------------- */ template pack fmax(pack const &a0, pack const &a1) { return max(a0, a1); } /* ------------------------------------------------------------------------- */ } // namespace nsimd #endif ================================================ FILE: include/nsimd/modules/fixed_point.hpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #ifndef NSIMD_MODULES_FIXED_POINT_HPP #define NSIMD_MODULES_FIXED_POINT_HPP #include #include "nsimd/modules/fixed_point/fixed.hpp" #include "nsimd/modules/fixed_point/simd.hpp" #include "nsimd/modules/fixed_point/simd_math.hpp" namespace nsimd { namespace fixed_point { // ----------------------------------------------------------------------------- // ------------------------ Types definitions and len -------------------------- // ----------------------------------------------------------------------------- template NSIMD_STRUCT pack; template int len(const T &) { return fpsimd_n(T()); } template int len(const nsimd::fixed_point::pack &) { return fpsimd_n(fpsimd_t()); } template NSIMD_STRUCT pack { static const u8 lf = T::lf; static const u8 rt = T::rt; typedef fp_t value_type; fpsimd_t val; friend std::ostream &operator<<(std::ostream &os, pack &a0) { T *buf = new T[nsimd::fixed_point::len(a0)]; nsimd::fixed_point::simd_storeu( buf , a0.val ); os << "{ "; int n = nsimd::fixed_point::len(a0); for (int i = 0; i < n; i++) { os << buf[i]; if (i < n - 1) { os << ", "; } } os << " }"; delete[] buf; return os; } }; template NSIMD_STRUCT packl { static const u8 lf = T::lf; static const u8 rt = T::rt; typedef typename fp_t::logical_type value_type; fpsimdl_t val; }; // ----------------------------------------------------------------------------- // ------------------- Basic arithmetic operators ------------------------------ // ----------------------------------------------------------------------------- template NSIMD_INLINE pack add(const pack &a0, const pack &a1) { pack res; res.val = simd_add(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator+(const pack &a0, const pack &a1) { return add( a0 , a1 ); } template NSIMD_INLINE pack sub(const pack &a0, const pack &a1) { pack res; res.val = simd_sub(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator-(const pack &a0, const pack &a1) { return sub( a0 , a1 ); } template NSIMD_INLINE pack 
mul(const pack &a0, const pack &a1) { pack res; res.val = simd_mul(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator*(const pack &a0, const pack &a1) { return mul( a0 , a1 ); } template NSIMD_INLINE pack div(const pack &a0, const pack &a1) { pack res; res.val = simd_div(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator/(const pack &a0, const pack &a1) { return div( a0 , a1 ); } template NSIMD_INLINE pack fma(const pack &a0, const pack &a1, const pack &a2) { pack res; res.val = simd_fma(a0.val, a1.val, a2.val); return res; } template NSIMD_INLINE pack min(const pack &a0, const pack &a1) { pack res; res.val = simd_min(a0.val, a1.val); return res; } template NSIMD_INLINE pack max(const pack &a0, const pack &a1) { pack res; res.val = simd_max(a0.val, a1.val); return res; } // ----------------------------------------------------------------------------- // ------------------- Comparison operators ------------------------------------ // ----------------------------------------------------------------------------- template NSIMD_INLINE packl eq(const pack &a0, const pack &a1) { packl res; res.val = simd_eq(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator==(const pack &a0, const pack &a1) { return eq( a0 , a1 ); } template NSIMD_INLINE packl ne(const pack &a0, const pack &a1) { packl res; res.val = simd_ne(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator!=(const pack &a0, const pack &a1) { return ne( a0 , a1 ); } template NSIMD_INLINE packl le(const pack &a0, const pack &a1) { packl res; res.val = simd_le(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator<=(const pack &a0, const pack &a1) { return le( a0 , a1 ); } template NSIMD_INLINE packl lt(const pack &a0, const pack &a1) { packl res; res.val = simd_lt(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator<(const pack &a0, const pack &a1) { return lt( a0 , a1 ); } template NSIMD_INLINE packl ge(const pack &a0, const pack 
&a1) { packl res; res.val = simd_ge(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator>=(const pack &a0, const pack &a1) { return ge( a0 , a1 ); } template NSIMD_INLINE packl gt(const pack &a0, const pack &a1) { packl res; res.val = simd_gt(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator>(const pack &a0, const pack &a1) { return gt( a0 , a1 ); } template NSIMD_INLINE pack if_else1(const packl &a0, const pack &a1, const pack &a2) { pack res; res.val = simd_if_else1(a0.val, a1.val, a2.val); return res; } // ----------------------------------------------------------------------------- // ------------------- Bitwise operators -------------------------------------- // ----------------------------------------------------------------------------- template NSIMD_INLINE pack andb(const pack &a0, const pack &a1) { pack res; res.val = simd_andb(a0.val, a1.val); return res; } template NSIMD_INLINE packl andl(const packl &a0, const packl &a1) { packl res; res.val = simd_andl(a0.val, a1.val); return res; } template NSIMD_INLINE pack andnotb(const pack &a0, const pack &a1) { pack res; res.val = simd_andnotb(a0.val, a1.val); return res; } template NSIMD_INLINE packl andnotl(const packl &a0, const packl &a1) { packl res; res.val = simd_andnotl(a0.val, a1.val); return res; } template NSIMD_INLINE pack notb(pack a0) { pack res; res.val = simd_notb(a0.val); return res; } template NSIMD_INLINE packl notl(packl a0) { packl res; res.val = simd_notl(a0.val); return res; } template NSIMD_INLINE pack orb(const pack &a0, const pack &a1) { pack res; res.val = simd_orb(a0.val, a1.val); return res; } template NSIMD_INLINE packl orl(const packl &a0, const packl &a1) { packl res; res.val = simd_orl(a0.val, a1.val); return res; } template NSIMD_INLINE pack xorb(const pack &a0, const pack &a1) { pack res; res.val = simd_xorb(a0.val, a1.val); return res; } template NSIMD_INLINE packl xorl(const packl &a0, const packl &a1) { packl res; res.val = simd_xorl(a0.val, 
a1.val); return res; } // ----------------------------------------------------------------------------- // ------------------- Math functions ------------------------------------------ // ----------------------------------------------------------------------------- template NSIMD_INLINE pack abs(pack a0) { pack res; res.val = simd_abs(a0.val); return res; } template NSIMD_INLINE pack rec(pack a0) { pack res; res.val = simd_rec(a0.val); return res; } // ----------------------------------------------------------------------------- // -------------------- Load functions ----------------------------------------- // ----------------------------------------------------------------------------- template NSIMD_INLINE T set1(typename T::value_type a0) { T res; res.val = simd_set1(a0); return res; } template NSIMD_INLINE T loadu(typename T::value_type *p) { T res; res.val = simd_loadu(p); return res; } template NSIMD_INLINE T loada(typename T::value_type *p) { T res; res.val = simd_loada(p); return res; } template NSIMD_INLINE T loadlu(typename T::value_type *p) { T res; res.val = simd_loadlu(p); return res; } template NSIMD_INLINE T loadla(typename T::value_type *p) { T res; res.val = simd_loadla(p); return res; } // ----------------------------------------------------------------------------- // -------------------- Store functions ---------------------------------------- // ----------------------------------------------------------------------------- template NSIMD_INLINE void storeu(typename T::value_type *p, T v) { simd_storeu(p, v.val); } template NSIMD_INLINE void storea(typename T::value_type *p, T v) { simd_storea(p, v.val); } template NSIMD_INLINE void storelu(typename T::value_type *p, T v) { simd_storelu(p, v.val); } template NSIMD_INLINE void storela(typename T::value_type *p, T v) { simd_storela(p, v.val); } } // namespace fixed_point } // namespace nsimd #endif ================================================ FILE: include/nsimd/modules/memory_management.hpp 
================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #ifndef NSIMD_MODULES_MEMORY_MANAGEMENT_HPP #define NSIMD_MODULES_MEMORY_MANAGEMENT_HPP #include #include #include #include namespace nsimd { // ---------------------------------------------------------------------------- // CUDA #if defined(NSIMD_CUDA) template T *device_malloc(size_t sz) { void *ret; if (cudaMalloc(&ret, sz * sizeof(T)) != cudaSuccess) { return NULL; } return (T *)ret; } template T *device_calloc(size_t sz) { void *ret; if (cudaMalloc(&ret, sz * sizeof(T)) != cudaSuccess) { return NULL; } if (cudaMemset((void *)ret, 0, sz * sizeof(T)) != cudaSuccess) { cudaFree(ret); return NULL; } return (T *)ret; } template void device_free(T *ptr) { cudaFree((void *)ptr); } template void copy_to_device(T *device_ptr, T *host_ptr, size_t sz) { cudaMemcpy((void *)device_ptr, (void *)host_ptr, sz * sizeof(T), cudaMemcpyHostToDevice); } template void copy_to_host(T *host_ptr, T *device_ptr, size_t sz) { cudaMemcpy((void *)host_ptr, (void *)device_ptr, sz * sizeof(T), cudaMemcpyDeviceToHost); } #define nsimd_fill_dev_mem_func(func_name, expr) \ template \ __global__ void kernel_##func_name##_(T *ptr, int n) { \ int i = threadIdx.x + blockIdx.x * blockDim.x; \ if (i < n) { \ ptr[i] = (T)(expr); \ } \ } \ \ template void func_name(T *ptr, size_t sz) { \ kernel_##func_name##_<<<(unsigned int)((sz + 127) / 128), 128>>>( \ ptr, int(sz)); \ } // ---------------------------------------------------------------------------- // ROCm #elif defined(NSIMD_ROCM) template T *device_malloc(size_t sz) { void *ret; if (hipMalloc(&ret, sz * sizeof(T)) != hipSuccess) { return NULL; } return (T *)ret; } template T *device_calloc(size_t sz) { void *ret; if (hipMalloc(&ret, sz * sizeof(T)) != hipSuccess) { return NULL; } if (hipMemset((void *)ret, 0, sz * sizeof(T)) != hipSuccess) { hipFree(ret); return NULL; } return (T *)ret; } template void device_free(T *ptr) { hipFree((void *)ptr); } template void copy_to_device(T *device_ptr, T *host_ptr, size_t sz) { hipMemcpy((void 
*)device_ptr, (void *)host_ptr, sz * sizeof(T), hipMemcpyHostToDevice); } template void copy_to_host(T *host_ptr, T *device_ptr, size_t sz) { hipMemcpy((void *)host_ptr, (void *)device_ptr, sz * sizeof(T), hipMemcpyDeviceToHost); } #define nsimd_fill_dev_mem_func(func_name, expr) \ template \ __global__ void kernel_##func_name##_(T *ptr, size_t n) { \ size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; \ if (i < n) { \ ptr[i] = (T)(expr); \ } \ } \ \ template void func_name(T *ptr, size_t sz) { \ hipLaunchKernelGGL((kernel_##func_name##_), \ (size_t)((sz + 127) / 128), 128, 0, NULL, ptr, \ (size_t)sz); \ } // ---------------------------------------------------------------------------- // oneAPI #elif defined(NSIMD_ONEAPI) template T *device_malloc(const size_t sz) { return sycl::malloc_device(sz, nsimd::oneapi::default_queue()); } template T *device_calloc(const size_t sz) { sycl::queue q = nsimd::oneapi::default_queue(); T *const ret = sycl::malloc_device(sz, q); if (ret == NULL) { return NULL; } q.memset((void *)ret, 0, sz * sizeof(T)).wait_and_throw(); return ret; } template void device_free(T *const ptr) { sycl::queue q = nsimd::oneapi::default_queue(); sycl::free(ptr, q); } template void copy_to_device(T *const device_ptr, const T *const host_ptr, const size_t sz) { sycl::queue q = nsimd::oneapi::default_queue(); q.memcpy((void *)device_ptr, (const void *)host_ptr, sz * sizeof(T)) .wait_and_throw(); } template void copy_to_host(T *const host_ptr, const T *const device_ptr, size_t sz) { sycl::queue q = nsimd::oneapi::default_queue(); q.memcpy((void *)host_ptr, (const void *)device_ptr, sz * sizeof(T)) .wait_and_throw(); } #define nsimd_fill_dev_mem_func(func_name, expr) \ template \ void kernel_##func_name##_(T *const ptr, const size_t sz, \ sycl::nd_item<1> item) { \ const size_t i = item.get_global_id().get(0); \ if (i < sz) { \ ptr[i] = nsimd::to(expr); \ } \ } \ \ template void func_name(T *const ptr, const size_t sz) { \ const size_t 
total_num_threads = \ nsimd::compute_total_num_threads(sz, THREADS_PER_BLOCK); \ sycl::queue q = nsimd::oneapi::default_queue(); \ q.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), \ sycl::range<1>(THREADS_PER_BLOCK)), \ [=](sycl::nd_item<1> item) { \ kernel_##func_name##_(ptr, sz, item); \ }) \ .wait_and_throw(); \ } // ---------------------------------------------------------------------------- // CPU #else template T *device_malloc(size_t sz) { return (T *)malloc(sz * sizeof(T)); } template T *device_calloc(size_t sz) { return (T *)calloc(sz * sizeof(T), 1); } template void device_free(T *ptr) { free((void *)ptr); } template void copy_to_device(T *device_ptr, T *host_ptr, size_t sz) { memcpy((void *)device_ptr, (void *)host_ptr, sz * sizeof(T)); } template void copy_to_host(T *host_ptr, T *device_ptr, size_t sz) { memcpy((void *)host_ptr, (void *)device_ptr, sz * sizeof(T)); } #define nsimd_fill_dev_mem_func(func_name, expr) \ template void func_name(T *ptr, size_t sz) { \ for (size_t i = 0; i < sz; i++) { \ ptr[i] = nsimd::to(expr); \ } \ } #endif // ---------------------------------------------------------------------------- // Pair of pointers template struct paired_pointers_t { T *device_ptr, *host_ptr; size_t sz; }; template paired_pointers_t pair_malloc(size_t sz) { paired_pointers_t ret; ret.sz = 0; #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) ret.device_ptr = device_malloc(sz); if (ret.device_ptr == NULL) { ret.host_ptr = NULL; return ret; } ret.host_ptr = (T *)malloc(sz); if (ret.host_ptr == NULL) { device_free(ret.device_ptr); ret.device_ptr = NULL; return ret; } #else ret.device_ptr = device_malloc(sz); ret.host_ptr = ret.device_ptr; #endif ret.sz = sz; return ret; } template paired_pointers_t pair_malloc_or_exit(size_t sz) { paired_pointers_t ret = pair_malloc(sz); if (ret.device_ptr == NULL) { std::cerr << __FILE__ << ":" << __LINE__ << ": error cannot malloc " << sz << " bytes" << std::endl; 
exit(EXIT_FAILURE); } return ret; } template paired_pointers_t pair_calloc(size_t sz) { paired_pointers_t ret; ret.sz = 0; #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) ret.device_ptr = device_calloc(sz); if (ret.device_ptr == NULL) { ret.host_ptr = NULL; return ret; } ret.host_ptr = calloc(sz, 1); if (ret.host_ptr == NULL) { device_free(ret.device_ptr); ret.device_ptr = NULL; return ret; } #else ret.device_ptr = device_calloc(sz); ret.host_ptr = ret.device_ptr; #endif ret.sz = sz; return ret; } template paired_pointers_t pair_calloc_or_exit(size_t sz) { paired_pointers_t ret = pair_calloc(sz); if (ret.device_ptr == NULL) { std::cerr << __FILE__ << ":" << __LINE__ << ": error cannot calloc " << sz << " bytes" << std::endl; exit(EXIT_FAILURE); } return ret; } template void pair_free(paired_pointers_t p) { #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) device_free(p.device_free); free((void *)p.host_ptr); #else free((void *)p.host_ptr); #endif } template void copy_to_device(paired_pointers_t p) { #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) copy_to_device(p.device_ptr, p.host_ptr, p.sz); #else (void)p; #endif } template void copy_to_host(paired_pointers_t p) { #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) copy_to_host(p.host_ptr, p.device_ptr, p.sz); #else (void)p; #endif } } // namespace nsimd #endif ================================================ FILE: include/nsimd/modules/spmd.hpp ================================================ /* Copyright (c) 2020 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject 
to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef NSIMD_MODULES_SPMD_HPP #define NSIMD_MODULES_SPMD_HPP #include #include #include #include namespace spmd { #if NSIMD_CXX < 2011 || NSIMD_C < 1999 #define NSIMD_VARIADIC_MACROS_IS_EXTENSION #endif #ifdef NSIMD_VARIADIC_MACROS_IS_EXTENSION #if defined(NSIMD_IS_GCC) /* Not emitting the warning -Wvariadic-macros is not possible with GCC <= 12. It is a bug. A workaround is to tell GCC to consider this header file as a system header file so that all warnings are not emitted. This is not satisfying but necessary for the moment. */ #pragma GCC system_header #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wvariadic-macros" #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wvariadic-macros" #endif #endif // ---------------------------------------------------------------------------- // GPUs: CUDA, ROCm or oneAPI #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) #if defined(NSIMD_CUDA) // 1d kernel definition #define spmd_kernel_1d(name, ...) \ template __global__ void name(__VA_ARGS__, int n) { \ int spmd_i_ = threadIdx.x + blockIdx.x * blockDim.x; \ if (spmd_i_ < n) { // templated kernel definition #define spmd_tmpl_kernel_1d(name, template_argument, ...) 
\ template \ __global__ void name(__VA_ARGS__, int n) { \ int spmd_i_ = threadIdx.x + blockIdx.x * blockDim.x; \ if (spmd_i_ < n) { #elif defined(NSIMD_ROCM) // 1d kernel definition #define spmd_kernel_1d(name, ...) \ template \ __global__ void name(__VA_ARGS__, size_t n) { \ size_t spmd_i_ = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; \ if (spmd_i_ < n) { // templated kernel definition #define spmd_tmpl_kernel_1d(name, template_argument, ...) \ template \ __global__ void name(__VA_ARGS__, size_t n) { \ size_t spmd_i_ = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; \ if (spmd_i_ < n) { #else // 1d kernel definition #define spmd_kernel_1d(name, ...) \ template \ inline void name(__VA_ARGS__, const size_t n, sycl::nd_item<1> item) { \ size_t spmd_i_ = item.get_global_id().get(0); \ if (spmd_i_ < n) { // templated kernel definition #define spmd_tmpl_kernel_1d(name, template_argument, ...) \ template \ inline void name(__VA_ARGS__, const size_t n, sycl::nd_item<1> item) { \ size_t spmd_i_ = item.get_global_id().get(0); \ if (spmd_i_ < n) { #endif #define spmd_kernel_end \ } \ } #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) // device function #define spmd_dev_func(type_name, ...) \ template __device__ type_name(__VA_ARGS__) { // templated device function #define spmd_tmpl_dev_func(type_name, template_argument, ...) \ template \ __device__ type_name(__VA_ARGS__) { #else // device function #define spmd_dev_func(type_name, ...) \ template type_name(__VA_ARGS__) { // templated device function #define spmd_tmpl_dev_func(type_name, template_argument, ...) \ template \ type_name(__VA_ARGS__) { #endif #define spmd_dev_func_end } // call spmd_dev_function #define spmd_call_dev_func(name, ...) name(__VA_ARGS__) // call templated spmd_dev_function #define spmd_call_tmpl_dev_func(name, template_argument, ...) \ name(__VA_ARGS__) #if defined(NSIMD_CUDA) // launch 1d kernel CUDA #define spmd_launch_kernel_1d(name, spmd_scalar_bits_, threads_per_block, n, \ ...) 
\ name \ <<<(unsigned int)nsimd_kernel_param(n, threads_per_block), \ (unsigned int)(threads_per_block)>>>(__VA_ARGS__, (int)n) #elif defined(NSIMD_ROCM) // launch 1d kernel ROCm #define spmd_launch_kernel_1d(name, spmd_scalar_bits_, threads_per_block, n, \ ...) \ hipLaunchKernelGGL((name), \ (size_t)nsimd_kernel_param(n, threads_per_block), \ (size_t)(threads_per_block), 0, NULL, __VA_ARGS__, \ (size_t)n) #else // launch 1d kernel oneAPI #define spmd_launch_kernel_1d(name, spmd_scalar_bits_, threads_per_block, n, \ ...) \ size_t total_num_threads = \ (size_t)nsimd_kernel_param(n, threads_per_block); \ sycl::queue q = nsimd::oneapi::default_queue(); \ q.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), \ sycl::range<1>(threads_per_block)), \ [=](sycl::nd_item<1> item) { \ name(__VA_ARGS__, (size_t)n, item); \ }) \ .wait_and_throw(); #endif // supported types (generic) template struct type_t {}; // supported types (scalar) template <> struct type_t<8> { typedef i8 itype; typedef u8 utype; typedef bool btype; }; template <> struct type_t<16> { typedef i16 itype; typedef u16 utype; typedef f16 ftype; typedef bool btype; }; template <> struct type_t<32> { typedef i32 itype; typedef u32 utype; typedef f32 ftype; typedef bool btype; }; template <> struct type_t<64> { typedef i64 itype; typedef u64 utype; typedef f64 ftype; typedef bool btype; }; // supported types (generic) #define k_int typename spmd::type_t::itype #define k_uint typename spmd::type_t::utype #define k_float typename spmd::type_t::ftype #define k_bool typename spmd::type_t::btype // loads and stores (generic) #define k_store(base_addr, value) \ do { \ base_addr[spmd_i_] = value; \ } while (0) #define k_unmasked_store(base_addr, value) k_store(base_addr, value) #define k_load(base_addr) base_addr[spmd_i_] #define k_unmasked_load(base_addr) k_load(base_addr) // f32 <--> f16 conversions #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) #define k_f32_to_f16(a) __float2half(a) #define 
k_f16_to_f32(a) __half2float(a) #else #define k_f32_to_f16(a) f16(a) #define k_f16_to_f32(a) static_cast(a) #endif // assignment statement #define k_set(var, value) \ do { \ var = value; \ } while (0) #define k_unmasked_set(var, value) k_set(var, value) // while statement (k_while) #define k_while(cond) while (cond) { #define k_endwhile } // break statement (k_break) #define k_break break // continue statement (k_continue) #define k_continue continue // endwhile statement (k_endwhile) #define k_endwhile } // if statement (k_if) #define k_if(cond) if (cond) { // elseif statement (k_elseif) #define k_elseif(cond) \ } \ else if (cond) { // else statement (k_else) #define k_else \ } \ else { // endif statement (k_endif) #define k_endif } // ---------------------------------------------------------------------------- // SIMD and SCALAR: dispatch between the two is done on a type #else // helpers template nsimd::pack to_pack(T a) { return nsimd::pack(a); } template nsimd::pack to_pack(nsimd::pack const &a) { return a; } template nsimd::packl to_packl(bool a) { return nsimd::packl(int(a)); } template nsimd::packl to_packl(Pack const &a) { return nsimd::reinterpretl >(a); } template struct base_type { typedef T type; }; template struct base_type > { typedef T type; }; template struct base_type > { typedef T type; }; // type indicating SIMD or scalar kernel struct KernelScalar {}; struct KernelSIMD {}; // common to all function: mainly to avoid warnings #define spmd_func_begin_ \ (void)spmd_i_; \ (void)spmd_mask_; \ k_bool spmd_off_lanes_return_(false); \ (void)spmd_off_lanes_return_; \ k_bool spmd_off_lanes_break_(false); \ (void)spmd_off_lanes_break_; \ k_bool spmd_off_lanes_continue_(false); \ (void)spmd_off_lanes_continue_; // 1d kernel definition #define spmd_kernel_1d(name, ...) \ template \ void name(nsimd_nat spmd_i_, spmd_MaskType_ spmd_mask_, __VA_ARGS__) { \ spmd_func_begin_ // templated kernel definition #define spmd_tmpl_kernel_1d(name, template_argument, ...) 
\ template \ void name(nsimd_nat spmd_i_, spmd_MaskType_ spmd_mask_, __VA_ARGS__) { \ spmd_func_begin_ #define spmd_kernel_end } // device function #define spmd_dev_func(type_name, ...) \ template \ type_name(nsimd_nat spmd_i_, spmd_MaskType_ spmd_mask_, __VA_ARGS__) { \ spmd_func_begin_ // templated device function #define spmd_tmpl_dev_func(type_name, template_argument, ...) \ template \ type_name(nsimd_nat spmd_i_, spmd_MaskType_ spmd_mask_, __VA_ARGS__) { \ spmd_func_begin_ #define spmd_dev_func_end } // call spmd_dev_function #define spmd_call_dev_func(name, ...) \ name(spmd_i_, spmd_mask_, \ __VA_ARGS__) // call templated spmd_dev_function #define spmd_call_tmpl_dev_func(name, template_argument, ...) \ name( \ spmd_i_, spmd_mask_, __VA_ARGS__) // launch 1d kernel #define spmd_launch_kernel_1d(name, spmd_scalar_bits_, spmd_unroll_, spmd_n_, \ ...) \ { \ spmd::type_t::btype \ spmd_mask_(true); \ nsimd_nat spmd_i_; \ nsimd_nat len = \ nsimd::len(spmd::type_t::itype()); \ for (spmd_i_ = 0; spmd_i_ + len <= spmd_n_; spmd_i_ += len) { \ name( \ spmd_i_, spmd_mask_, __VA_ARGS__); \ } \ for (; spmd_i_ < spmd_n_; spmd_i_++) { \ name( \ spmd_i_, true, __VA_ARGS__); \ } \ } // launch 1d templated kernel #define spmd_launch_tmpl_kernel_1d( \ name, template_argument, spmd_scalar_bits_, spmd_unroll_, spmd_n_, ...) 
\ { \ typename spmd::type_t::btype spmd_mask_(true); \ nsimd_nat spmd_i_; \ nsimd_nat len = \ nsimd::len(typename spmd::type_t::itype()); \ for (spmd_i_ = 0; spmd_i_ + len <= spmd_n_; spmd_i_ += len) { \ name(spmd_i_, spmd_mask_, __VA_ARGS__); \ } \ for (; spmd_i_ < spmd_n_; spmd_i_++) { \ name(spmd_i_, true, __VA_ARGS__); \ } \ } // supported types (generic) template struct type_t {}; // supported types (scalar) template struct type_t { typedef i8 itype; typedef u8 utype; typedef bool btype; }; template struct type_t { typedef i16 itype; typedef u16 utype; typedef f16 ftype; typedef bool btype; }; template struct type_t { typedef i32 itype; typedef u32 utype; typedef f32 ftype; typedef bool btype; }; template struct type_t { typedef i64 itype; typedef u64 utype; typedef f64 ftype; typedef bool btype; }; // supported types (SIMD) template struct type_t { typedef nsimd::pack itype; typedef nsimd::pack utype; typedef nsimd::packl btype; }; template struct type_t { typedef nsimd::pack itype; typedef nsimd::pack utype; typedef nsimd::pack ftype; typedef nsimd::packl btype; }; template struct type_t { typedef nsimd::pack itype; typedef nsimd::pack utype; typedef nsimd::pack ftype; typedef nsimd::packl btype; }; template struct type_t { typedef nsimd::pack itype; typedef nsimd::pack utype; typedef nsimd::pack ftype; typedef nsimd::packl btype; }; // supported types (generic) #define k_int \ typename spmd::type_t::itype #define k_uint \ typename spmd::type_t::utype #define k_float \ typename spmd::type_t::ftype #define k_bool \ typename spmd::type_t::btype // loads and stores (generic) template struct store_helper {}; template struct load_helper {}; #define k_store(base_addr, value) \ spmd::store_helper::impl(spmd_mask_, &base_addr[spmd_i_], \ value) #define k_unmasked_store(base_addr, value) \ spmd::store_helper::unmasked_impl(&base_addr[spmd_i_], \ value) #define k_load(base_addr) \ spmd::load_helper::impl(spmd_mask_, &base_addr[spmd_i_]) #define 
k_unmasked_load(base_addr) \ spmd::load_helper::template unmasked_impl( \ &base_addr[spmd_i_]) // loads and stores (scalar) template <> struct store_helper { template static void impl(bool mask, T *addr, S value) { if (mask) { *addr = nsimd::to(value); } } template static void unmasked_impl(T *addr, S value) { *addr = nsimd::to(value); } }; template <> struct load_helper { template static T impl(bool mask, T *addr) { if (mask) { return *addr; } else { return nsimd::to(0); } } template static T unmasked_impl(T *addr) { return *addr; } }; template <> struct store_helper { template static void impl(nsimd::packl const &mask, S *addr, nsimd::pack const &value) { nsimd::mask_storeu(mask, addr, value); } template static void impl(nsimd::packl const &mask, S *addr, U value) { nsimd::mask_storeu(mask, addr, nsimd::pack(nsimd::to(value))); } template static void unmasked_impl(T *addr, nsimd::pack const &value) { nsimd::storeu(addr, value); } template static void unmasked_impl(T *addr, S value) { nsimd::storeu(addr, nsimd::pack(nsimd::to(value))); } }; template <> struct load_helper { template static nsimd::pack impl(nsimd::packl const &mask, S *addr) { return nsimd::maskz_loadu(mask, addr); } template static nsimd::pack unmasked_impl(T *addr) { return nsimd::loadu >(addr); } }; // f32 <--> f16 conversions #define k_f32_to_f16(a) nsimd_f32_to_f16(a) #define k_f16_to_f32(a) nsimd_f16_to_f32(a) // Clear lanes template nsimd::packl clear_lanes(nsimd::packl const &mask, nsimd::packl const &lanes) { return nsimd::andnotl(mask, lanes); } inline bool clear_lanes(bool mask, bool lanes) { return lanes ? 
false : mask; } // assignment statement template void k_set_(bool mask, T &var, S value) { if (mask) { var = nsimd::to(value); } } template void k_set_(nsimd::packl const &mask, nsimd::pack &var, U value) { var = nsimd::if_else(mask, nsimd::pack(S(value)), var); } template void k_set_(nsimd::packl const &mask, nsimd::pack &var, nsimd::pack const &value) { var = nsimd::if_else(mask, value, var); } template void k_set_(nsimd::packl const &mask, nsimd::packl &var, U value) { var = nsimd::reinterpretl >( mask && nsimd::pack(int(value))); } template void k_set_(nsimd::packl const &mask, nsimd::packl &var, nsimd::packl const &value) { var = nsimd::reinterpretl >(mask && value); } #define k_set(var, value) spmd::k_set_(spmd_mask_, var, value) #define k_unmasked_set(var, value) \ do { \ var = value; \ } while (0) template bool any(nsimd::packl const a) { return nsimd::any(a); } template typename type_t::btype to_k_bool_(Packl const &a) { return nsimd::reinterpretl< typename type_t::btype>(a); } template inline bool to_k_bool_(bool a) { return a; } #define k_to_bool(a) \ spmd::to_k_bool_(a) inline bool any(bool a) { return a; } // while statement (k_while) #define k_while(cond) \ { \ k_bool spmd_middle_mask_ = spmd_mask_; \ k_bool spmd_off_lanes_break_(false); \ (void)spmd_off_lanes_break_; \ k_bool spmd_off_lanes_continue_(false); \ (void)spmd_off_lanes_continue_; \ { \ while (spmd::any(cond)) { \ k_bool spmd_cond_ = \ spmd::to_k_bool_( \ cond); \ { \ k_bool spmd_mask_ = spmd_cond_ && spmd_middle_mask_; \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_break_); \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_return_); // break statement (k_break) #define k_break \ spmd_off_lanes_break_ = spmd_off_lanes_break_ || spmd_mask_; \ spmd_mask_ = false; // continue statement (k_continue) #define k_continue \ spmd_off_lanes_continue_ = spmd_off_lanes_continue_ || spmd_mask_; \ spmd_mask_ = false; // endwhile statement (k_endwhile) #define k_endwhile \ } \ } \ 
} \ } \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_return_); // return statement (k_return) #define k_return \ spmd_off_lanes_return_ = spmd_off_lanes_return_ || spmd_mask_; \ spmd_mask_ = false; // if statement (k_if) #define k_if(cond) \ { \ k_bool spmd_cond_ = \ spmd::to_k_bool_(cond); \ k_bool spmd_middle_mask_ = spmd_mask_; \ { \ k_bool spmd_mask_ = spmd_cond_ && spmd_middle_mask_; // elseif statement (k_elseif) #define k_elseif(cond) \ } \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_return_); \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_break_); \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_continue_); \ spmd_middle_mask_ = spmd::clear_lanes(spmd_middle_mask_, spmd_cond_); \ spmd_cond_ = \ spmd::to_k_bool_(cond); \ { \ k_bool spmd_mask_ = spmd_cond_ && spmd_middle_mask_; // else statement (k_else) #define k_else \ } \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_return_); \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_break_); \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_continue_); \ spmd_middle_mask_ = spmd::clear_lanes(spmd_middle_mask_, spmd_cond_); \ { \ k_bool spmd_mask_ = spmd_middle_mask_; // endif statement (k_endif) #define k_endif \ } \ } \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_return_); \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_break_); \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_continue_); // ---------------------------------------------------------------------------- #endif #ifdef NSIMD_VARIADIC_MACROS_IS_EXTENSION #if defined(NSIMD_IS_GCC) #pragma GCC diagnostic pop #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic pop #endif #endif } // namespace spmd #include #endif ================================================ FILE: include/nsimd/modules/tet1d.hpp ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any 
person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef NSIMD_MODULES_TET1D_HPP #define NSIMD_MODULES_TET1D_HPP #include #include #include #include namespace tet1d { // ---------------------------------------------------------------------------- // general definitions struct none_t {}; template struct node {}; const nsimd::nat end = nsimd::nat(-1); // ---------------------------------------------------------------------------- // Error management #if defined(NSIMD_CUDA) #define nsimd_cuda_assert(ans) tet1d::gpuCheck((ans), __FILE__, __LINE__) inline void gpuCheck(cudaError_t code, const char *file, int line) { if (code != cudaSuccess) { fprintf(stderr, "NSIMD Internal error:\n\ttet1d Error: %s %s %d\n", cudaGetErrorString(code), file, line); exit(code); } } #endif // ---------------------------------------------------------------------------- // supported kernels #if defined(NSIMD_CUDA) // CUDA component wise kernel template __global__ void gpu_kernel_component_wise(T *dst, Expr const expr, nsimd::nat n) { int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < 
n) { dst[i] = expr.gpu_get(i); } } // CUDA component wise kernel with masked output template __global__ void gpu_kernel_component_wise_mask(T *dst, Mask const mask, Expr const expr, nsimd::nat n) { int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < n && mask.gpu_get(i)) { dst[i] = expr.gpu_get(i); } } #elif defined(NSIMD_ROCM) // ROCM component wise kernel template __global__ void gpu_kernel_component_wise(T *dst, Expr const expr, nsimd::nat n) { int i = int(hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x); if (i < n) { dst[i] = expr.gpu_get(i); } } // ROCM component wise kernel with masked output template __global__ void gpu_kernel_component_wise_mask(T *dst, Mask const mask, Expr const expr, nsimd::nat n) { int i = int(hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x); if (i < n && mask.gpu_get(i)) { dst[i] = expr.gpu_get(i); } } #elif defined(NSIMD_ONEAPI) // oneAPI component wise kernel template void oneapi_kernel_component_wise(T *dst, Expr const expr, nsimd::nat n, sycl::nd_item<1> item) { const int i = static_cast(item.get_global_id().get(0)); if (i < n) { dst[i] = expr.gpu_get(i); } } // oneAPI component wise kernel with masked output template void oneapi_kernel_component_wise_mask(T *dst, Mask const mask, Expr const expr, nsimd::nat n, sycl::nd_item<1> item) { nsimd::nat i = static_cast(item.get_global_id().get(0)); if (i < n && mask.gpu_get(i)) { dst[i] = expr.gpu_get(i); } } #else // CPU component wise kernel template void cpu_kernel_component_wise(T *dst, Expr const &expr, nsimd::nat n) { nsimd::nat i; int len = nsimd::len(Pack()); for (i = 0; i + len < n; i += len) { nsimd::storeu(&dst[i], expr.template simd_get(i)); } for (; i < n; i++) { dst[i] = expr.scalar_get(i); } } // CPU component wise kernel with masked output template void cpu_kernel_component_wise_mask(T *dst, Mask const &mask, Expr const &expr, nsimd::nat n) { nsimd::nat i; int len = nsimd::len(Pack()); for (i = 0; i + len < n; i += len) { nsimd::storeu(&dst[i], 
nsimd::if_else(mask.template simd_get(i), expr.template simd_get(i), nsimd::loadu(&dst[i]))); } for (; i < n; i++) { if (mask.scalar_get(i)) { dst[i] = expr.scalar_get(i); } } } #endif // ---------------------------------------------------------------------------- // helper for computing sizes of 1D vectors nsimd::nat compute_size(nsimd::nat sz1, nsimd::nat sz2) { assert(sz1 >= 0 || sz2 >= 0); assert((sz1 < 0 && sz2 >= 0) || (sz1 >= 0 && sz2 < 0) || (sz1 == sz2)); if (sz1 < 0) { return sz2; } else { return sz1; } } nsimd::nat compute_size(nsimd::nat sz1, nsimd::nat sz2, nsimd::nat sz3) { return compute_size(compute_size(sz1, sz2), sz3); } // ---------------------------------------------------------------------------- // meta for building a pack from another ignoring the base type template struct to_pack_t { static const int unroll = Pack::unroll; typedef typename Pack::simd_ext simd_ext; typedef nsimd::pack type; }; template struct to_pack_t, Pack> { static const int unroll = Pack::unroll; typedef typename Pack::simd_ext simd_ext; typedef nsimd::pack type; }; template struct to_packl_t { static const int unroll = Pack::unroll; typedef typename Pack::simd_ext simd_ext; typedef nsimd::packl type; }; template struct to_packl_t, Pack> { static const int unroll = Pack::unroll; typedef typename Pack::simd_ext simd_ext; typedef nsimd::packl type; }; // ---------------------------------------------------------------------------- // scalar node struct scalar_t {}; template struct node { typedef T in_type; typedef T out_type; T value; #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) __device__ T gpu_get(nsimd::nat) const { return value; } #elif defined(NSIMD_ONEAPI) T gpu_get(nsimd::nat) const { return value; } #else T scalar_get(nsimd::nat) const { return value; } template typename to_pack_t::type simd_get(nsimd::nat) const { typedef typename to_pack_t::type pack; return pack(value); } #endif nsimd::nat size() const { return -1; } }; // 
---------------------------------------------------------------------------- // build a node from a scalar and a node template struct to_node_t { typedef node type; static type impl(T n) { type ret; ret.value = n; return ret; } }; template struct to_node_t > { typedef node type; static type impl(type node) { return node; } }; template typename to_node_t::type to_node(T n) { return to_node_t::impl(n); } // ---------------------------------------------------------------------------- // convert literal to one NSIMD base type template struct literal_to { template static T impl(S a) { return T(a); } }; template <> struct literal_to { template static f16 impl(S a) { return nsimd_f32_to_f16(f32(a)); } }; // ---------------------------------------------------------------------------- // input node struct in_t {}; #define TET1D_IN(T) tet1d::node template struct node { const T *data; nsimd::nat sz; typedef T in_type; typedef T out_type; #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) __device__ T gpu_get(nsimd::nat i) const { return data[i]; } #elif defined(NSIMD_ONEAPI) T gpu_get(nsimd::nat i) const { return data[i]; } #else T scalar_get(nsimd::nat i) const { return data[i]; } template typename to_pack_t::type simd_get(nsimd::nat i) const { typedef typename to_pack_t::type pack; return nsimd::loadu(&data[i]); } #endif nsimd::nat size() const { return sz; } template node operator()(I0 i0_, I1 i1_) const { node ret; nsimd::nat i0 = nsimd::nat(i0_); nsimd::nat i1 = nsimd::nat(i1_); i0 = i0 >= 0 ? i0 : sz + i0; i1 = i1 >= 0 ? i1 : sz + i1; assert(0 <= i0 && i0 < i1 && i1 < sz); ret.data = &data[i0]; ret.sz = i1 - i0 + 1; return ret; } }; // return an input node from a pointer template inline node in(const T *data, I sz) { node ret; ret.data = data; ret.sz = nsimd::nat(sz); return ret; } // ---------------------------------------------------------------------------- // output with condition node: I(I > 50) = ... 
struct mask_out_t {}; template struct node { typedef typename Pack::value_type T; T *data; nsimd::nat threads_per_block; void *stream; Mask mask; template node operator=(node const &expr) { #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) nsimd::nat expr_size = compute_size(mask.size(), expr.size()); nsimd::nat nt = threads_per_block < 0 ? 128 : threads_per_block; nsimd::nat param = nsimd_kernel_param(expr_size, nt); assert(nt > 0 && nt <= UINT_MAX); assert(param > 0 && param <= UINT_MAX); #if defined(NSIMD_CUDA) cudaStream_t s = (stream == NULL ? NULL : *(cudaStream_t *)stream); // clang-format off gpu_kernel_component_wise_mask<<<(unsigned int)(param), (unsigned int)(nt), 0, s>>> (data, mask, expr, expr_size); // clang-format on #elif defined(NSIMD_ROCM) hipStream_t s = stream == NULL ? NULL : *(hipStream_t *)stream; hipLaunchKernelGGL(gpu_kernel_component_wise_mask, (unsigned int)(param), (unsigned int)(nt), 0, s, data, mask, expr, expr_size); #else sycl::queue q = nsimd::oneapi::default_queue(); q.parallel_for(sycl::nd_range<1>(sycl::range<1>((size_t)param), sycl::range<1>((size_t)nt)), [=, *this](sycl::nd_item<1> item) { oneapi_kernel_component_wise_mask(data, mask, expr, expr_size, item); }) .wait_and_throw(); #endif #else cpu_kernel_component_wise_mask( data, mask, expr, compute_size(mask.size(), expr.size())); #endif return *this; } template node operator=(S a) { return operator=(to_node(literal_to::impl(a))); } }; // ---------------------------------------------------------------------------- // output node struct out_t {}; #define TET1D_OUT(T) \ tet1d::node > #define TET1D_OUT_EX(T, N, SimdExt) \ tet1d::node > template struct node { typedef typename Pack::value_type T; T *data; nsimd::nat threads_per_block; void *stream; template node operator()(Mask mask) const { node ret; ret.data = data; ret.mask = mask; ret.threads_per_block = threads_per_block; ret.stream = stream; return ret; } template node operator=(node const &expr) { #if 
defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) nsimd::nat nt = threads_per_block < 0 ? 128 : threads_per_block; nsimd::nat param = nsimd_kernel_param(expr.size(), nt); assert(nt > 0 && nt <= UINT_MAX); assert(param > 0 && param <= UINT_MAX); #if defined(NSIMD_CUDA) cudaStream_t s = stream == NULL ? NULL : *(cudaStream_t *)stream; // clang-format off gpu_kernel_component_wise<<<(unsigned int)(param), (unsigned int)(nt), 0, s>>>(data, expr, expr.size()); // clang-format on #elif defined(NSIMD_ROCM) hipStream_t s = stream == NULL ? NULL : *(hipStream_t *)stream; hipLaunchKernelGGL( (gpu_kernel_component_wise >), (unsigned int)(param), (unsigned int)(nt), 0, s, data, expr, expr.size()); #else sycl::queue q = nsimd::oneapi::default_queue(); q.parallel_for( sycl::nd_range<1>(sycl::range<1>((size_t)param), sycl::range<1>((size_t)nt)), [=, *this](sycl::nd_item<1> item) { oneapi_kernel_component_wise(data, expr, expr.size(), item); }) .wait_and_throw(); #endif #else cpu_kernel_component_wise(data, expr, expr.size()); #endif return *this; } }; // return an output node from a pointer template node > out(T *data) { node > ret; ret.data = data; ret.threads_per_block = 128; ret.stream = NULL; return ret; } template node out(T *data, int threads_per_block, void *stream) { node ret; ret.data = data; ret.threads_per_block = threads_per_block; ret.stream = stream; return ret; } // ---------------------------------------------------------------------------- } // namespace tet1d #include #endif ================================================ FILE: include/nsimd/nsimd-all.h ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 
of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef NSIMD_ALL_H #define NSIMD_ALL_H #include #include #endif ================================================ FILE: include/nsimd/nsimd-all.hpp ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #ifndef NSIMD_ALL_HPP #define NSIMD_ALL_HPP #include #include #include #include #endif ================================================ FILE: include/nsimd/nsimd.h ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #ifndef NSIMD_H #define NSIMD_H /* clang-format off */ /* ------------------------------------------------------------------------- */ /* Compiler detection (order matters https://stackoverflow.com/a/28166605) */ /* Detect host compiler */ #if defined(_MSC_VER) #define NSIMD_IS_MSVC #elif defined(__ibmxl_version__) #define NSIMD_IS_XLC #elif defined(__FCC_version__) #define NSIMD_IS_FCC #elif defined(__INTEL_COMPILER) #define NSIMD_IS_ICC #elif defined(__clang__) #define NSIMD_IS_CLANG #elif defined(__GNUC__) || defined(__GNUG__) #define NSIMD_IS_GCC #endif /* Detect device compiler, if any */ #if defined(__HIPCC__) #define NSIMD_IS_HIPCC #elif defined(__INTEL_CLANG_COMPILER) || defined(__INTEL_LLVM_COMPILER) #define NSIMD_IS_DPCPP #elif defined(__NVCC__) #define NSIMD_IS_NVCC #endif /* ------------------------------------------------------------------------- */ /* C standard detection */ #ifdef NSIMD_IS_MSVC #define NSIMD_C 1999 #else #ifdef __STDC_VERSION__ #if __STDC_VERSION__ == 199901L #define NSIMD_C 1999 #elif __STDC_VERSION__ >= 201112L #define NSIMD_C 2011 #else #define NSIMD_C 1989 #endif #else #define NSIMD_C 1989 #endif #endif /* ------------------------------------------------------------------------- */ /* C++ standard detection */ #ifdef NSIMD_IS_MSVC #ifdef _MSVC_LANG #define NSIMD__cplusplus _MSVC_LANG #else #define NSIMD__cplusplus __cplusplus #endif #else #ifdef __cplusplus #define NSIMD__cplusplus __cplusplus #else #define NSIMD__cplusplus 0 #endif #endif #if NSIMD__cplusplus > 0 && NSIMD__cplusplus < 201103L #define NSIMD_CXX 1998 #elif NSIMD__cplusplus >= 201103L && NSIMD__cplusplus < 201402L #define NSIMD_CXX 2011 #elif NSIMD__cplusplus >= 201402L && NSIMD__cplusplus < 201703L #define NSIMD_CXX 2014 #elif NSIMD__cplusplus == 201703L #define NSIMD_CXX 2017 #elif NSIMD__cplusplus >= 201704L #define NSIMD_CXX 2020 #else #define NSIMD_CXX 0 #endif #if NSIMD_CXX >= 2020 #include #include #endif /* 
------------------------------------------------------------------------- */ /* Use of long long for GCC even in C89 and C++98. Note that for some reason */ /* the use of the __extension__ keyword does not prevent warning so we deal */ /* with them now. We keep the __extension__ keyword in case. */ #if NSIMD_CXX < 2011 && NSIMD_C < 1999 #define NSIMD_LONGLONG_IS_EXTENSION #endif #ifdef NSIMD_LONGLONG_IS_EXTENSION #if defined(NSIMD_IS_GCC) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wlong-long" #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wlong-long" #endif #endif typedef long long nsimd_longlong; typedef unsigned long long nsimd_ulonglong; #if NSIMD_CXX > 0 namespace nsimd { typedef long long longlong; typedef unsigned long long ulonglong; } // namespace nsimd #endif #ifdef __UINT64_TYPE__ typedef __UINT64_TYPE__ nsimd_uint64_type; #endif #ifdef __INT64_TYPE__ typedef __INT64_TYPE__ nsimd_int64_type; #endif #ifdef NSIMD_LONGLONG_IS_EXTENSION #if defined(NSIMD_IS_GCC) #pragma GCC diagnostic pop #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic pop #endif #endif /* ------------------------------------------------------------------------- */ /* Register size detection */ #if defined(__x86_64) || defined(__x86_64__) || defined(__amd64__) || \ defined(__amd64) || defined(_M_AMD64) || defined(__aarch64__) || \ defined(_M_ARM64) || defined(__PPC64__) #define NSIMD_WORD_SIZE 64 #else #define NSIMD_WORD_SIZE 32 #endif /* ------------------------------------------------------------------------- */ /* Architecture detection */ #if defined(i386) || defined(__i386__) || defined(__i486__) || \ defined(__i586__) || defined(__i686__) || defined(__i386) || \ defined(_M_IX86) || defined(_X86_) || defined(__THW_INTEL__) || \ defined(__I86__) || defined(__INTEL__) || defined(__x86_64) || \ defined(__x86_64__) || defined(__amd64__) || defined(__amd64) || \ defined(_M_X64) #define NSIMD_X86 #elif defined(__arm__) || 
defined(__arm64) || defined(__thumb__) || \ defined(__TARGET_ARCH_ARM) || defined(__TARGET_ARCH_THUMB) || \ defined(_M_ARM) || defined(_M_ARM64) || defined(__arch64__) #define NSIMD_ARM #elif defined(__ppc__) || defined(__powerpc__) || defined(__PPC__) #define NSIMD_POWERPC #else #define NSIMD_GENERIC #endif /* ------------------------------------------------------------------------- */ /* Microsoft DLL specifics */ #ifdef NSIMD_IS_MSVC #define NSIMD_DLLEXPORT __declspec(dllexport) #define NSIMD_DLLIMPORT __declspec(dllimport) #else #define NSIMD_DLLEXPORT #define NSIMD_DLLIMPORT extern #endif /* ------------------------------------------------------------------------- */ /* DLL specifics when inside/outside the library */ #ifdef NSIMD_INSIDE #define NSIMD_DLLSPEC NSIMD_DLLEXPORT #else #define NSIMD_DLLSPEC NSIMD_DLLIMPORT #endif /* ------------------------------------------------------------------------- */ /* Vector calling convention: https://devblogs.microsoft.com/cppblog /introducing-vector-calling-convention/ */ #if defined(NSIMD_IS_MSVC) && NSIMD_WORD_SIZE == 32 #define NSIMD_VECTORCALL __vectorcall #else #define NSIMD_VECTORCALL #endif /* ------------------------------------------------------------------------- */ /* inline in nsimd is ONLY useful for linkage */ #if NSIMD_CXX > 0 || NSIMD_C > 1989 #if NSIMD_C > 0 && defined(NSIMD_IS_MSVC) #define NSIMD_INLINE static __inline #else #define NSIMD_INLINE static inline #endif #else #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG) #define NSIMD_INLINE __extension__ static __inline #else #define NSIMD_INLINE #endif #endif /* ------------------------------------------------------------------------- */ /* Pre-processor */ #define NSIMD_PP_CAT_2_e(a, b) a##b #define NSIMD_PP_CAT_2(a, b) NSIMD_PP_CAT_2_e(a, b) #define NSIMD_PP_CAT_3_e(a, b, c) a##b##c #define NSIMD_PP_CAT_3(a, b, c) NSIMD_PP_CAT_3_e(a, b, c) #define NSIMD_PP_CAT_4_e(a, b, c, d) a##b##c##d #define NSIMD_PP_CAT_4(a, b, c, d) NSIMD_PP_CAT_4_e(a, b, 
c, d) #define NSIMD_PP_CAT_5_e(a, b, c, d, e) a##b##c##d##e #define NSIMD_PP_CAT_5(a, b, c, d, e) NSIMD_PP_CAT_5_e(a, b, c, d, e) #define NSIMD_PP_CAT_6_e(a, b, c, d, e, f) a##b##c##d##e##f #define NSIMD_PP_CAT_6(a, b, c, d, e, f) NSIMD_PP_CAT_6_e(a, b, c, d, e, f) #define NSIMD_PP_EXPAND_e(a) a #define NSIMD_PP_EXPAND(a) NSIMD_PP_EXPAND_e(a) /* ------------------------------------------------------------------------- */ /* Detect architecture/SIMD */ #if defined(CPU) && !defined(NSIMD_CPU) #define NSIMD_CPU #endif /* Intel */ #if defined(SSE2) && !defined(NSIMD_SSE2) #define NSIMD_SSE2 #endif #if defined(SSE42) && !defined(NSIMD_SSE42) #define NSIMD_SSE42 #endif #if defined(AVX) && !defined(NSIMD_AVX) #define NSIMD_AVX #endif #if defined(AVX2) && !defined(NSIMD_AVX2) #define NSIMD_AVX2 #endif #if defined(AVX512_KNL) && !defined(NSIMD_AVX512_KNL) #define NSIMD_AVX512_KNL #endif #if defined(AVX512_SKYLAKE) && !defined(NSIMD_AVX512_SKYLAKE) #define NSIMD_AVX512_SKYLAKE #endif #if defined(FP16) && !defined(NSIMD_FP16) #define NSIMD_FP16 #endif #if defined(FMA) && !defined(NSIMD_FMA) #define NSIMD_FMA #endif /* ARM */ #if defined(NEON128) && !defined(NSIMD_NEON128) #define NSIMD_NEON128 #endif #if defined(AARCH64) && !defined(NSIMD_AARCH64) #define NSIMD_AARCH64 #endif #if defined(SVE) && !defined(NSIMD_SVE) #define NSIMD_SVE #define NSIMD_SVE_FAMILY #endif #if defined(SVE128) && !defined(NSIMD_SVE128) #define NSIMD_SVE128 #define NSIMD_SVE_FAMILY #endif #if defined(SVE256) && !defined(NSIMD_SVE256) #define NSIMD_SVE256 #define NSIMD_SVE_FAMILY #endif #if defined(SVE512) && !defined(NSIMD_SVE512) #define NSIMD_SVE512 #define NSIMD_SVE_FAMILY #endif #if defined(SVE1024) && !defined(NSIMD_SVE1024) #define NSIMD_SVE1024 #define NSIMD_SVE_FAMILY #endif #if defined(SVE2048) && !defined(NSIMD_SVE2048) #define NSIMD_SVE2048 #define NSIMD_SVE_FAMILY #endif /* PPC */ #if (defined(VMX) || defined(ALTIVEC)) && !defined(NSIMD_VMX) #define NSIMD_VMX #endif #if defined(VSX) && 
!defined(NSIMD_VSX) #define NSIMD_VSX #endif /* CUDA */ #if defined(CUDA) && !defined(NSIMD_CUDA) #define NSIMD_CUDA #endif /* ROCm */ #if defined(ROCM) && !defined(NSIMD_ROCM) #define NSIMD_ROCM #endif /* oneAPI */ #if defined(ONEAPI) && !defined(NSIMD_ONEAPI) #define NSIMD_ONEAPI /* undef ONEAPI is needed because ONEAPI is used as a namespace in DPC++: sycl::ONEAPI */ #ifdef ONEAPI #undef ONEAPI #endif #endif /* ------------------------------------------------------------------------- */ /* Set NSIMD_SIMD and NSIMD_PLATFORM macro, include the correct header. */ #if defined(NSIMD_SSE2) #define NSIMD_PLATFORM x86 #define NSIMD_SIMD sse2 #include #if defined(NSIMD_FMA) || defined(NSIMD_FP16) #include #endif /* For some reason MSVC <= 2015 has intrinsics defined in another header */ #ifdef NSIMD_IS_MSVC #include #endif #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct sse2 {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, sse2 #endif } // namespace nsimd #endif #elif defined(NSIMD_SSE42) #define NSIMD_PLATFORM x86 #define NSIMD_SIMD sse42 #include #if defined(NSIMD_FMA) || defined(NSIMD_FP16) #include #endif /* For some reason MSVC <= 2015 has intrinsics defined in another header */ #ifdef NSIMD_IS_MSVC #include #endif #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct sse2 {}; struct sse42 {}; #if nsIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, sse2, sse42 #endif } // namespace nsimd #endif #elif defined(NSIMD_AVX) #define NSIMD_PLATFORM x86 #define NSIMD_SIMD avx #include /* For some reason MSVC <= 2015 has intrinsics defined in another header */ #ifdef NSIMD_IS_MSVC #include #endif #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct sse2 {}; struct sse42 {}; struct avx {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v || 
std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, sse2, sse42, avx #endif } // namespace nsimd #endif #elif defined(NSIMD_AVX2) #define NSIMD_PLATFORM x86 #define NSIMD_SIMD avx2 #include /* For some reason MSVC <= 2015 has intrinsics defined in another header */ #ifdef NSIMD_IS_MSVC #include #endif #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct sse2 {}; struct sse42 {}; struct avx {}; struct avx2 {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, sse2, sse42, avx, avx2 #endif } // namespace nsimd #endif #elif defined(NSIMD_AVX512_KNL) #define NSIMD_PLATFORM x86 #define NSIMD_SIMD avx512_knl #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct sse2 {}; struct sse42 {}; struct avx {}; struct avx2 {}; struct avx512_knl {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, sse2, sse42, avx, avx2, avx512_knl #endif } // namespace nsimd #endif #elif defined(NSIMD_AVX512_SKYLAKE) #define NSIMD_PLATFORM x86 #define NSIMD_SIMD avx512_skylake #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct sse2 {}; struct sse42 {}; struct avx {}; struct avx2 {}; struct avx512_skylake {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, sse2, sse42, avx, avx2, avx512_skylake #endif } // namespace nsimd #endif #elif defined(NSIMD_NEON128) #define NSIMD_PLATFORM arm #define NSIMD_SIMD neon128 #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct neon128 {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, neon128 #endif } // namespace nsimd #endif #elif 
defined(NSIMD_AARCH64) #define NSIMD_PLATFORM arm #define NSIMD_SIMD aarch64 #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct aarch64 {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, aarch64 #endif } // namespace nsimd #endif #elif defined(NSIMD_SVE) #define NSIMD_PLATFORM arm #define NSIMD_SIMD sve #include #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct aarch64 {}; struct sve {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve #endif } // namespace nsimd #endif #elif defined(NSIMD_SVE128) #define NSIMD_PLATFORM arm #define NSIMD_SIMD sve128 #include #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct aarch64 {}; struct sve128 {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve128 #endif } // namespace nsimd #endif #elif defined(NSIMD_SVE256) #define NSIMD_PLATFORM arm #define NSIMD_SIMD sve256 #include #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct aarch64 {}; struct sve256 {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve256 #endif } // namespace nsimd #endif #elif defined(NSIMD_SVE512) #define NSIMD_PLATFORM arm #define NSIMD_SIMD sve512 #include #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct aarch64 {}; struct sve512 {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve512 #endif } // namespace nsimd #endif #elif defined(NSIMD_SVE1024) #define NSIMD_PLATFORM arm #define NSIMD_SIMD sve1024 #include #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct aarch64 {}; struct sve1024 {}; #if 
NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve1024 #endif } // namespace nsimd #endif #elif defined(NSIMD_SVE2048) #define NSIMD_PLATFORM arm #define NSIMD_SIMD sve2048 #include #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct aarch64 {}; struct sve2048 {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve2048 #endif } // namespace nsimd #endif #elif defined(NSIMD_VMX) #define NSIMD_PLATFORM ppc #define NSIMD_SIMD vmx #ifdef NSIMD_IS_CLANG /* New version of clang are spamming useless warning comming from their */ /* altivec.h file */ #pragma clang diagnostic ignored "-Wc11-extensions" #pragma clang diagnostic ignored "-Wc++11-long-long" #endif #include #ifdef bool #undef bool #endif #ifdef pixel #undef pixel #endif #ifdef vector #undef vector #endif #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct vmx {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, vmx #endif } // namespace nsimd #endif #elif defined(NSIMD_VSX) #define NSIMD_PLATFORM ppc #define NSIMD_SIMD vsx #ifdef NSIMD_IS_CLANG /* New version of clang are spamming useless warning comming from their */ /* altivec.h file */ #pragma clang diagnostic ignored "-Wc11-extensions" #pragma clang diagnostic ignored "-Wc++11-long-long" #endif #include #ifdef bool #undef bool #endif #ifdef pixel #undef pixel #endif #ifdef vector #undef vector #endif #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct vmx {}; struct vsx {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, vsx #endif } // namespace nsimd #endif #else #ifdef NSIMD_CUDA #if defined(NSIMD_IS_GCC) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-function" 
#elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wunused-function" #endif #include #if defined(NSIMD_IS_GCC) #pragma GCC diagnostic pop #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic pop #endif #endif #ifdef NSIMD_ROCM #include #include #endif #if defined(NSIMD_ONEAPI) && NSIMD_CXX > 0 #include extern "C" { NSIMD_DLLSPEC void *nsimd_oneapi_default_queue(); } // extern "C" namespace nsimd { namespace oneapi { NSIMD_INLINE sycl::queue &default_queue() { return *(sycl::queue *)nsimd_oneapi_default_queue(); } } // namespace oneapi } // namespace nsimd #endif #define NSIMD_SIMD cpu #define NSIMD_PLATFORM cpu #ifdef NSIMD_IS_MSVC #include #endif #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu #endif } // namespace nsimd #endif #endif #if NSIMD_CXX >= 2020 #define NSIMD_CONCEPT_SIMD_EXT nsimd::simd_ext_c #else #define NSIMD_CONCEPT_SIMD_EXT typename #endif /* ------------------------------------------------------------------------- */ /* For ARM SVE we need a special struct */ #ifdef NSIMD_SVE #define NSIMD_STRUCT __sizeless_struct #else #define NSIMD_STRUCT struct #endif /* ------------------------------------------------------------------------- */ /* Shorter typedefs for integers and their limits */ #if NSIMD_CXX > 0 #include #else #include #endif #if defined(NSIMD_ONEAPI) typedef sycl::cl_char i8; typedef sycl::cl_uchar u8; typedef sycl::cl_short i16; typedef sycl::cl_ushort u16; typedef sycl::cl_int i32; typedef sycl::cl_uint u32; typedef sycl::cl_long i64; typedef sycl::cl_ulong u64; #elif defined(NSIMD_IS_MSVC) typedef unsigned __int8 u8; typedef signed __int8 i8; typedef unsigned __int16 u16; typedef signed __int16 i16; typedef unsigned __int32 u32; typedef signed __int32 i32; typedef unsigned __int64 u64; typedef signed __int64 i64; #else typedef unsigned char u8; typedef signed char i8; typedef unsigned 
short u16; typedef signed short i16; #ifdef __UINT32_TYPE__ typedef __UINT32_TYPE__ u32; #else #if defined(NSIMD_NEON128) && __ARM_ARCH <= 6 typedef unsigned long u32; #else typedef unsigned int u32; #endif #endif #ifdef __INT32_TYPE__ typedef __INT32_TYPE__ i32; #else #if defined(NSIMD_NEON128) && __ARM_ARCH <= 6 typedef signed long i32; #else typedef signed int i32; #endif #endif #if defined(NSIMD_VMX) || defined(NSIMD_VSX) typedef nsimd_ulonglong u64; typedef nsimd_longlong i64; #elif NSIMD_WORD_SIZE == 64 #ifdef __UINT64_TYPE__ typedef nsimd_uint64_type u64; #else typedef unsigned long u64; #endif #ifdef __INT64_TYPE__ typedef nsimd_int64_type i64; #else typedef signed long i64; #endif #else #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG) typedef nsimd_ulonglong u64; typedef nsimd_longlong i64; #else typedef unsigned long long u64; typedef signed long long i64; #endif #endif #endif #define NSIMD_U8_MIN ((u8)0) #define NSIMD_U8_MAX UCHAR_MAX #define NSIMD_I8_MIN SCHAR_MIN #define NSIMD_I8_MAX SCHAR_MAX #define NSIMD_U16_MIN ((u16)0) #define NSIMD_U16_MAX USHRT_MAX #define NSIMD_I16_MIN SHRT_MIN #define NSIMD_I16_MAX SHRT_MAX #define NSIMD_U32_MIN ((u32)0) #define NSIMD_U32_MAX UINT_MAX #define NSIMD_I32_MIN INT_MIN #define NSIMD_I32_MAX INT_MAX #ifdef NSIMD_IS_MSVC #define NSIMD_U64_MIN ((u64)0) #define NSIMD_U64_MAX ULLONG_MAX #define NSIMD_I64_MIN LLONG_MIN #define NSIMD_I64_MAX LLONG_MAX #else #if NSIMD_WORD_SIZE == 64 #define NSIMD_U64_MIN ((u64)0) #define NSIMD_U64_MAX ULONG_MAX #define NSIMD_I64_MIN LONG_MIN #define NSIMD_I64_MAX LONG_MAX #else #define NSIMD_U64_MIN ((u64)0) #define NSIMD_U64_MAX (~((u64)0)) #define NSIMD_I64_MIN ((i64)1 << 63) #define NSIMD_I64_MAX (~((i64)1 << 63)) #endif #endif /* ------------------------------------------------------------------------- */ /* Shorter typedefs for floatting point types */ #if ((defined(NSIMD_NEON128) || defined(NSIMD_AARCH64)) && \ defined(NSIMD_FP16)) || defined(NSIMD_SVE_FAMILY) #define 
NSIMD_ARM_FP16 #endif #ifdef NSIMD_ARM_FP16 typedef __fp16 f16; #define NSIMD_NATIVE_FP16 #elif defined(NSIMD_CUDA) || defined(NSIMD_ROCM) typedef __half f16; #define NSIMD_NATIVE_FP16 #elif defined(NSIMD_ONEAPI) typedef sycl::half f16; #define NSIMD_NATIVE_FP16 #else typedef struct { u16 u; } f16; #endif #if defined(NSIMD_ONEAPI) typedef sycl::cl_float f32; typedef sycl::cl_double f64; #else typedef float f32; typedef double f64; #endif /* ------------------------------------------------------------------------- */ /* Native register size (for now only 32 and 64 bits) types */ #if NSIMD_WORD_SIZE == 64 typedef i64 nsimd_nat; #else typedef i32 nsimd_nat; #endif #if NSIMD_CXX > 0 namespace nsimd { typedef nsimd_nat nat; } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* C++ traits for base types */ #if NSIMD_CXX > 0 namespace nsimd { // Some C++20 concepts first #if NSIMD_CXX >= 2020 template concept simd_value_type_c = std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_CONCEPT_VALUE_TYPE nsimd::simd_value_type_c template concept simd_value_type_or_bool_c = simd_value_type_c || std::is_same_v; #define NSIMD_CONCEPT_VALUE_TYPE_OR_BOOL nsimd::simd_value_type_or_bool_c // We need our own sizeof because of f16 which can be 4 bytes (i.e. a // float) on systems where there is no support for native f16. 
template struct sizeof_t { static const size_t value = sizeof(T); }; template <> struct sizeof_t { static const size_t value = 2; }; template const size_t sizeof_v = sizeof_t::value; #define NSIMD_REQUIRES(cond) requires(cond) #else #define NSIMD_CONCEPT_VALUE_TYPE typename #define NSIMD_CONCEPT_VALUE_TYPE_OR_BOOL typename #define NSIMD_REQUIRES(cond) #endif template struct traits {}; // 8-bits template <> struct traits { typedef i8 itype; typedef u8 utype; }; template <> struct traits { typedef i8 itype; typedef u8 utype; }; // 16-bits template <> struct traits { typedef i16 itype; typedef u16 utype; typedef f16 ftype; }; template <> struct traits { typedef i16 itype; typedef u16 utype; typedef f16 ftype; }; template <> struct traits { typedef i16 itype; typedef u16 utype; typedef f16 ftype; }; // 32-bits template <> struct traits { typedef i32 itype; typedef u32 utype; typedef f32 ftype; }; template <> struct traits { typedef i32 itype; typedef u32 utype; typedef f32 ftype; }; template <> struct traits { typedef i32 itype; typedef u32 utype; typedef f32 ftype; }; // 64-bits template <> struct traits { typedef i64 itype; typedef u64 utype; typedef f64 ftype; }; template <> struct traits { typedef i64 itype; typedef u64 utype; typedef f64 ftype; }; template <> struct traits { typedef i64 itype; typedef u64 utype; typedef f64 ftype; }; } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* Set if denormalized float are set to 0 */ #ifdef NSIMD_NEON128 #define NSIMD_DNZ_FLUSH_TO_ZERO #endif /* clang-format on */ /* ------------------------------------------------------------------------- */ /* POPCNT: GCC and Clang have intrinsics */ NSIMD_INLINE int nsimd_popcnt32_(u32 a) { #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG) return __builtin_popcount(a); #elif defined(NSIMD_IS_MSVC) return (int)__popcnt(a); #else int i, ret = 0; for (i = 0; i < 32; i++) { ret += (int)((a >> i) & 1); } return ret; #endif } 
NSIMD_INLINE int nsimd_popcnt64_(u64 a) { #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG) #if __SIZEOF_LONG__ == 4 return __builtin_popcountl((u32)(a & 0xFFFFFFFF)) + __builtin_popcountl((u32)(a >> 32)); #else return __builtin_popcountl(a); #endif #elif defined(NSIMD_IS_MSVC) #if NSIMD_WORD_SIZE == 64 return (int)__popcnt64(a); #else return (int)__popcnt((u32)(a & 0xFFFFFFFF)) + (int)__popcnt((u32)(a >> 32)); #endif #else int i, ret = 0; for (i = 0; i < 64; i++) { ret += (int)((a >> i) & 1); } return ret; #endif } /* ------------------------------------------------------------------------- */ /* Macro to automatically include function depending on detected platform/SIMD */ #define NSIMD_AUTO_INCLUDE(path) /* ------------------------------------------------------------------------- */ /* Standard includes */ /* clang-format off */ #if NSIMD_CXX > 0 #include #include #else #include #include #endif /* clang-format on */ /* ------------------------------------------------------------------------- */ /* Now includes detected SIMD types */ #if NSIMD_CXX > 0 namespace nsimd { template struct simd_traits {}; } // namespace nsimd // Those are for writing shorter code #define NSIMD_NSV(T, SIMD_EXT) \ typename nsimd::simd_traits::simd_vector #define NSIMD_NSVX2(T, SIMD_EXT) \ typename nsimd::simd_traits::simd_vectorx2 #define NSIMD_NSVX3(T, SIMD_EXT) \ typename nsimd::simd_traits::simd_vectorx3 #define NSIMD_NSVX4(T, SIMD_EXT) \ typename nsimd::simd_traits::simd_vectorx4 #define NSIMD_NSVL(L, SIMD_EXT) \ typename nsimd::simd_traits::simd_vectorl #endif #include NSIMD_AUTO_INCLUDE(types.h) /* ------------------------------------------------------------------------- */ /* Macro/typedefs for SIMD infos */ #define vec(T) NSIMD_PP_CAT_4(nsimd_, NSIMD_SIMD, _v, T) #define vecl(T) NSIMD_PP_CAT_4(nsimd_, NSIMD_SIMD, _vl, T) #define vecx2(T) NSIMD_PP_CAT_5(nsimd_, NSIMD_SIMD, _v, T, x2) #define vecx3(T) NSIMD_PP_CAT_5(nsimd_, NSIMD_SIMD, _v, T, x3) #define vecx4(T) 
NSIMD_PP_CAT_5(nsimd_, NSIMD_SIMD, _v, T, x4) typedef vec(i8) vi8; typedef vec(u8) vu8; typedef vec(i16) vi16; typedef vec(u16) vu16; typedef vec(i32) vi32; typedef vec(u32) vu32; typedef vec(i64) vi64; typedef vec(u64) vu64; typedef vec(f16) vf16; typedef vec(f32) vf32; typedef vec(f64) vf64; typedef vecx2(i8) vi8x2; typedef vecx2(u8) vu8x2; typedef vecx2(i16) vi16x2; typedef vecx2(u16) vu16x2; typedef vecx2(i32) vi32x2; typedef vecx2(u32) vu32x2; typedef vecx2(i64) vi64x2; typedef vecx2(u64) vu64x2; typedef vecx2(f16) vf16x2; typedef vecx2(f32) vf32x2; typedef vecx2(f64) vf64x2; typedef vecx3(i8) vi8x3; typedef vecx3(u8) vu8x3; typedef vecx3(i16) vi16x3; typedef vecx3(u16) vu16x3; typedef vecx3(i32) vi32x3; typedef vecx3(u32) vu32x3; typedef vecx3(i64) vi64x3; typedef vecx3(u64) vu64x3; typedef vecx3(f16) vf16x3; typedef vecx3(f32) vf32x3; typedef vecx3(f64) vf64x3; typedef vecx4(i8) vi8x4; typedef vecx4(u8) vu8x4; typedef vecx4(i16) vi16x4; typedef vecx4(u16) vu16x4; typedef vecx4(i32) vi32x4; typedef vecx4(u32) vu32x4; typedef vecx4(i64) vi64x4; typedef vecx4(u64) vu64x4; typedef vecx4(f16) vf16x4; typedef vecx4(f32) vf32x4; typedef vecx4(f64) vf64x4; typedef vecl(i8) vli8; typedef vecl(u8) vlu8; typedef vecl(i16) vli16; typedef vecl(u16) vlu16; typedef vecl(i32) vli32; typedef vecl(u32) vlu32; typedef vecl(i64) vli64; typedef vecl(u64) vlu64; typedef vecl(f16) vlf16; typedef vecl(f32) vlf32; typedef vecl(f64) vlf64; #define vec_a(T, simd_ext) NSIMD_PP_CAT_4(nsimd_, simd_ext, _v, T) #define vecl_a(T, simd_ext) NSIMD_PP_CAT_4(nsimd_, simd_ext, _vl, T) #if NSIMD_CXX > 0 namespace nsimd { /* Alignment tags */ struct aligned {}; struct unaligned {}; #if NSIMD_CXX >= 2020 template concept alignment_c = std::is_same_v || std::is_same_v; #define NSIMD_CONCEPT_ALIGNMENT nsimd::alignment_c #else #define NSIMD_CONCEPT_ALIGNMENT typename #endif #if NSIMD_CXX >= 2011 template using simd_vector = typename simd_traits::simd_vector; template using simd_vectorl = typename 
simd_traits::simd_vectorl; #endif } // namespace nsimd #endif /* clang-format off */ #if defined(NSIMD_X86) #define NSIMD_MAX_ALIGNMENT 64 #elif defined(NSIMD_ARM) #define NSIMD_MAX_ALIGNMENT 256 #elif defined(NSIMD_POWERPC) #define NSIMD_MAX_ALIGNMENT 64 #else #define NSIMD_MAX_ALIGNMENT 16 #endif /* TODO: provide C++14 alignment constpexxr */ /* clang-format on */ #define NSIMD_NB_REGISTERS NSIMD_PP_CAT_3(NSIMD_, NSIMD_SIMD, _NB_REGISTERS) #define NSIMD_MAX_LEN_BIT 2048 #define NSIMD_MAX_LEN_i8 (NSIMD_MAX_LEN_BIT / 8) #define NSIMD_MAX_LEN_u8 (NSIMD_MAX_LEN_BIT / 8) #define NSIMD_MAX_LEN_i16 (NSIMD_MAX_LEN_BIT / 16) #define NSIMD_MAX_LEN_u16 (NSIMD_MAX_LEN_BIT / 16) #define NSIMD_MAX_LEN_f16 (NSIMD_MAX_LEN_BIT / 16) #define NSIMD_MAX_LEN_i32 (NSIMD_MAX_LEN_BIT / 32) #define NSIMD_MAX_LEN_u32 (NSIMD_MAX_LEN_BIT / 32) #define NSIMD_MAX_LEN_f32 (NSIMD_MAX_LEN_BIT / 32) #define NSIMD_MAX_LEN_i64 (NSIMD_MAX_LEN_BIT / 64) #define NSIMD_MAX_LEN_u64 (NSIMD_MAX_LEN_BIT / 64) #define NSIMD_MAX_LEN_f64 (NSIMD_MAX_LEN_BIT / 64) #define NSIMD_MAX_LEN_e(typ) NSIMD_MAX_LEN_##typ #define NSIMD_MAX_LEN(typ) NSIMD_MAX_LEN_e(typ) #if NSIMD_CXX > 0 namespace nsimd { template struct max_len_t {}; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 8; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 8; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 16; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 16; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 16; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 32; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 32; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 32; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 64; }; template <> struct max_len_t { static const int value = 
NSIMD_MAX_LEN_BIT / 64; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 64; }; #if NSIMD_CXX >= 2014 template constexpr int max_len = max_len_t::value; #endif } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* Memory functions */ /* clang-format off */ #if NSIMD_CXX > 0 #include #include #include #endif /* clang-format on */ /* ------------------------------------------------------------------------- */ #if NSIMD_CXX > 0 extern "C" { #endif NSIMD_DLLSPEC void *nsimd_aligned_alloc(nsimd_nat); NSIMD_DLLSPEC void nsimd_aligned_free(void *); #if NSIMD_CXX > 0 } // extern "C" #endif /* ------------------------------------------------------------------------- */ /* C++ templated functions */ #if NSIMD_CXX > 0 namespace nsimd { NSIMD_INLINE void *aligned_alloc(nsimd_nat n) { return nsimd_aligned_alloc(n); } NSIMD_INLINE void aligned_free(void *ptr) { nsimd_aligned_free(ptr); } template T *aligned_alloc_for(nsimd_nat n) { return (T *)aligned_alloc(n * (nsimd_nat)sizeof(T)); } template void aligned_free_for(void *ptr) { return aligned_free((T *)ptr); } } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* C++ <11 allocator */ #if NSIMD_CXX > 0 && NSIMD_CXX < 2011 namespace nsimd { template class allocator { public: typedef T value_type; typedef value_type *pointer; typedef const value_type *const_pointer; typedef value_type &reference; typedef const value_type &const_reference; typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; public: template struct rebind { typedef allocator other; }; public: allocator() {} ~allocator() {} allocator(allocator const &) {} template inline explicit allocator(allocator const &) {} pointer address(reference r) { return &r; } const_pointer address(const_reference r) { return &r; } pointer allocate(size_type n) { return reinterpret_cast(aligned_alloc_for((nsimd_nat)n)); } pointer 
allocate(size_type n, const void *) { return allocate(n); } void deallocate(pointer p, size_type) { aligned_free_for(p); } size_type max_size() const { return size_type(-1) / sizeof(T); } void construct(pointer p, const T &t) { new (p) T(t); } void destroy(pointer p) { p->~T(); } bool operator==(allocator const &) { return true; } bool operator!=(allocator const &a) { return !operator==(a); } }; } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* C++ >=11 allocator */ #if NSIMD_CXX >= 2011 namespace nsimd { template struct allocator { using value_type = T; allocator() = default; template allocator(allocator const &) {} T *allocate(std::size_t n) { if (n > std::size_t(-1) / sizeof(T)) { throw std::bad_alloc(); } T *ptr = aligned_alloc_for((nsimd_nat)n); if (ptr != NULL) { return ptr; } throw std::bad_alloc(); } void deallocate(T *ptr, std::size_t) { nsimd::aligned_free(ptr); } }; template bool operator==(allocator const &, allocator const &) { return true; } template bool operator!=(allocator const &, allocator const &) { return false; } } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* scoped allocator */ #if NSIMD_CXX > 0 namespace nsimd { template struct scoped_aligned_mem_for { std::vector > data; template #if NSIMD_CXX >= 2020 requires std::integral #endif scoped_aligned_mem_for(I n) { data.resize(size_t(n)); } const T *get() const { return &data[0]; } T *get() { return &data[0]; } }; } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* Conversion functions f16 <---> f32 for C but only when compiling with a */ /* host compiler. Otherwise we must have C++ linkage as fp16 types are */ /* defined as C++ classes . 
*/ #if NSIMD_CXX > 0 && !defined(NSIMD_CUDA) && !defined(NSIMD_ROCM) #define NSIMD_C_LINKAGE_FOR_F16 #endif #ifdef NSIMD_C_LINKAGE_FOR_F16 extern "C" { #endif NSIMD_DLLSPEC u16 nsimd_f32_to_u16(f32); NSIMD_DLLSPEC f32 nsimd_u16_to_f32(u16); #ifdef NSIMD_ARM_FP16 NSIMD_INLINE f16 nsimd_f32_to_f16(f32 a) { return (f16)a; } NSIMD_INLINE f32 nsimd_f16_to_f32(f16 a) { return (f32)a; } #elif (defined(NSIMD_CUDA) && __CUDACC_VER_MAJOR__ >= 10) || \ defined(NSIMD_ROCM) inline f16 nsimd_f32_to_f16(f32 a) { return __float2half(a); } inline f32 nsimd_f16_to_f32(f16 a) { return __half2float(a); } #elif defined(NSIMD_CUDA) && __CUDACC_VER_MAJOR__ < 10 inline f16 nsimd_f32_to_f16(f32 a) { u16 ret = nsimd_f32_to_u16(a); return *(__half *)&ret; } inline f32 nsimd_f16_to_f32(f16 a) { return nsimd_u16_to_f32(*(u16 *)&a); } #elif defined(NSIMD_ONEAPI) inline f16 nsimd_f32_to_f16(f32 a) { return static_cast(a); } inline f32 nsimd_f16_to_f32(f16 a) { return static_cast(a); } #else NSIMD_DLLSPEC f16 nsimd_f32_to_f16(f32); NSIMD_DLLSPEC f32 nsimd_f16_to_f32(f16); #endif #ifdef NSIMD_C_LINKAGE_FOR_F16 } // extern "C" #endif /* ------------------------------------------------------------------------- */ /* Conversion functions f16 <---> f32 for C++ */ #if NSIMD_CXX > 0 namespace nsimd { NSIMD_DLLSPEC u16 f32_to_u16(f32); NSIMD_DLLSPEC f32 u16_to_f32(u16); #ifdef NSIMD_ARM_FP16 NSIMD_INLINE f16 f32_to_f16(f32 a) { return (f16)a; } NSIMD_INLINE f32 f16_to_f32(f16 a) { return (f32)a; } #else NSIMD_DLLSPEC f16 f32_to_f16(f32); NSIMD_DLLSPEC f32 f16_to_f32(f16); #endif } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* Helper to print scalar values, converts to bigger type */ NSIMD_INLINE u64 nsimd_to_biggest_u8(u8 a) { return (u64)a; } NSIMD_INLINE u64 nsimd_to_biggest_u16(u16 a) { return (u64)a; } NSIMD_INLINE u64 nsimd_to_biggest_u32(u32 a) { return (u64)a; } NSIMD_INLINE u64 nsimd_to_biggest_u64(u64 a) { return a; } NSIMD_INLINE 
i64 nsimd_to_biggest_i8(i8 a) { return (i64)a; } NSIMD_INLINE i64 nsimd_to_biggest_i16(i16 a) { return (i64)a; } NSIMD_INLINE i64 nsimd_to_biggest_i32(i32 a) { return (i64)a; } NSIMD_INLINE i64 nsimd_to_biggest_i64(i64 a) { return a; } NSIMD_INLINE f64 nsimd_to_biggest_f16(f16 a) { return (f64)nsimd_f16_to_f32(a); } NSIMD_INLINE f64 nsimd_to_biggest_f32(f32 a) { return (f64)a; } NSIMD_INLINE f64 nsimd_to_biggest_f64(f64 a) { return a; } #if NSIMD_CXX > 0 namespace nsimd { NSIMD_INLINE u64 to_biggest(u8 a) { return nsimd_to_biggest_u8(a); } NSIMD_INLINE u64 to_biggest(u16 a) { return nsimd_to_biggest_u16(a); } NSIMD_INLINE u64 to_biggest(u32 a) { return nsimd_to_biggest_u32(a); } NSIMD_INLINE u64 to_biggest(u64 a) { return nsimd_to_biggest_u64(a); } NSIMD_INLINE i64 to_biggest(i8 a) { return nsimd_to_biggest_i8(a); } NSIMD_INLINE i64 to_biggest(i16 a) { return nsimd_to_biggest_i16(a); } NSIMD_INLINE i64 to_biggest(i32 a) { return nsimd_to_biggest_i32(a); } NSIMD_INLINE i64 to_biggest(i64 a) { return nsimd_to_biggest_i64(a); } NSIMD_INLINE f64 to_biggest(f16 a) { return nsimd_to_biggest_f16(a); } NSIMD_INLINE f64 to_biggest(f32 a) { return nsimd_to_biggest_f32(a); } NSIMD_INLINE f64 to_biggest(f64 a) { return nsimd_to_biggest_f64(a); } } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* General conversion for C++ */ #if NSIMD_CXX > 0 namespace nsimd { template struct to_helper { static T to(T, S value) { return (T)value; } }; template <> struct to_helper { static f16 to(f16, f16 value) { return value; } }; template struct to_helper { static f16 to(f16, S value) { return nsimd_f32_to_f16((f32)value); } }; template struct to_helper { static T to(T, f16 value) { return (T)nsimd_f16_to_f32(value); } }; template T to(S value) { return to_helper::to(T(), value); } } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* SIMD-related functions */ /* clang-format 
off */ #if defined(NSIMD_IS_MSVC) /* We do not want MSVC to warn us about unary minus on an unsigned type. It is well defined in standards: unsigned arithmetic is done modulo 2^n. */ #pragma warning(push) #pragma warning(disable : 4146) #elif defined(NSIMD_IS_CLANG) && NSIMD_CXX < 2011 /* When compiling with Clang with C++98 or C++03, some Intel intrinsics are implemented as macros which contain long long but long long are not standard. To get rid of a lot of warnings we push the corresponding warning here. */ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wc++11-long-long" #elif defined(NSIMD_IS_GCC) && defined(NSIMD_SVE_FAMILY) /* Using SVE intrinsics svundef_XXX() is supposed to silence the -Wuninitialized warnings but it does not with GCC 10.0 up to GCC 10.2 so we silence the warning manually for now. */ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" #elif defined(NSIMD_IS_GCC) && NSIMD_CXX > 0 && \ (defined(NSIMD_VMX) || defined(NSIMD_VSX)) /* When compiling POWERPC intrinsics inside C++ code with GCC we get tons of -Wunused-but-set-parameter. This is a GCC bug. For now we silence the warnings here.
*/ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-but-set-parameter" #pragma GCC diagnostic ignored "-Wunused-but-set-variable" #endif #include #if defined(NSIMD_IS_MSVC) #pragma warning(pop) #elif defined(NSIMD_IS_CLANG) && NSIMD_CXX < 2011 #pragma clang diagnostic pop #elif defined(NSIMD_IS_GCC) && defined(NSIMD_SVE_FAMILY) #pragma GCC diagnostic pop #elif defined(NSIMD_IS_GCC) && NSIMD_CXX > 0 && \ (defined(NSIMD_VMX) || defined(NSIMD_VSX)) #pragma GCC diagnostic pop #endif /* clang-format on */ /* ------------------------------------------------------------------------- */ /* If_else cannot be auto-generated */ #define vif_else(a0, a1, a2, typel, type) \ NSIMD_PP_CAT_4(nsimd_if_else1_, NSIMD_SIMD, _, type) \ (NSIMD_PP_CAT_6(nsimd_vreinterpretl_, NSIMD_SIMD, _, type, _, typel)(a0), \ a1, a2) #define vif_else_e(a0, a1, a2, typel, type, simd_ext) \ NSIMD_PP_CAT_4(nsimd_if_else1_, simd_ext, _, type) \ (NSIMD_PP_CAT_6(nsimd_vreinterpretl_, simd_ext, _, type, _, typel)(a0), a1, \ a2) #if NSIMD_CXX > 0 namespace nsimd { template NSIMD_REQUIRES(sizeof_v == sizeof_v) NSIMD_NSV(T, NSIMD_SIMD) if_else(NSIMD_NSVL(L, NSIMD_SIMD) a0, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2, L, T) { return if_else1(reinterpretl(a0, L(), T(), NSIMD_SIMD()), a1, a2, T(), NSIMD_SIMD()); } template NSIMD_REQUIRES(sizeof_v == sizeof_v) NSIMD_NSV(T, SimdExt) if_else(NSIMD_NSVL(L, SimdExt) a0, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2, L, T, SimdExt) { return if_else1(reinterpretl(a0, L(), T(), SimdExt()), a1, a2, T(), SimdExt()); } } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* Loads/stores can be parametrized/templated by the alignment */ #define NSIMD_ALIGNED a #define NSIMD_UNALIGNED u #define vload(a0, type, alignment) \ NSIMD_PP_CAT_6(nsimd_load, alignment, _, NSIMD_SIMD, _, type)(a0) #define vload_e(a0, type, simd_ext, alignment) \ NSIMD_PP_CAT_6(nsimd_load, alignment, _, simd_ext, 
_, type)(a0) #define vload2(a0, type, alignment) \ NSIMD_PP_CAT_6(nsimd_load2, alignment, _, NSIMD_SIMD, _, type)(a0) #define vload2_e(a0, type, simd_ext, alignment) \ NSIMD_PP_CAT_6(nsimd_load2, alignment, _, simd_ext, _, type)(a0) #define vload3(a0, type, alignment) \ NSIMD_PP_CAT_6(nsimd_load3, alignment, _, NSIMD_SIMD, _, type)(a0) #define vload3_e(a0, type, simd_ext, alignment) \ NSIMD_PP_CAT_6(nsimd_load3, alignment, _, simd_ext, _, type)(a0) #define vload4(a0, type, alignment) \ NSIMD_PP_CAT_6(nsimd_load4, alignment, _, NSIMD_SIMD, _, type)(a0) #define vload4_e(a0, type, simd_ext, alignment) \ NSIMD_PP_CAT_6(nsimd_load4, alignment, _, simd_ext, _, type)(a0) #define vloadl(a0, type, alignment) \ NSIMD_PP_CAT_6(nsimd_loadl, alignment_, NSIMD_SIMD, _, type)(a0) #define vloadl_e(a0, type, simd_ext, alignment) \ NSIMD_PP_CAT_6(nsimd_loadl, alignment_, simd_ext, _, type)(a0) #define vstore(a0, a1, type, alignment) \ NSIMD_PP_CAT_6(nsimd_store, alignment, _, NSIMD_SIMD, _, type)(a0, a1) #define vstore_e(a0, a1, type, simd_ext, alignment) \ NSIMD_PP_CAT_6(nsimd_store, alignment, _, simd_ext, _, type)(a0, a1) #define vstore2(a0, a1, a2, type, alignment) \ NSIMD_PP_CAT_4(nsimd_store2, alignment, _, NSIMD_SIMD, _, type)(a0, a1, a2) #define vstore2_e(a0, a1, a2, type, simd_ext, alignment) \ NSIMD_PP_CAT_4(nsimd_store2, alignment, _, simd_ext, _, type)(a0, a1, a2) #define vstore3(a0, a1, a2, a3, type, alignment) \ NSIMD_PP_CAT_4(nsimd_store3, alignment, _, NSIMD_SIMD, _, type) \ (a0, a1, a2, a3) #define vstore3_e(a0, a1, a2, a3, type, simd_ext, alignment) \ NSIMD_PP_CAT_4(nsimd_store3, alignment, _, simd_ext, _, type)(a0, a1, a2, a3) #define vstore4(a0, a1, a2, a3, a4, type, alignment) \ NSIMD_PP_CAT_4(nsimd_store3, alignment, _, NSIMD_SIMD, _, type) \ (a0, a1, a2, a3, a4) #define vstore4_e(a0, a1, a2, a3, a4, type, simd_ext, alignment) \ NSIMD_PP_CAT_4(nsimd_store3, alignment, _, simd_ext, _, type) \ (a0, a1, a2, a3, a4) #define vstorel(a0, a1, type, alignment) \ 
NSIMD_PP_CAT_6(nsimd_storel, alignment, _, NSIMD_SIMD, _, type)(a0, a1) #define vstorel_e(a0, a1, type, simd_ext, alignment) \ NSIMD_PP_CAT_6(nsimd_storel, alignment, _, simd_ext, _, type)(a0, a1) #if NSIMD_CXX > 0 namespace nsimd { template NSIMD_NSV(T, NSIMD_SIMD) load(const T *ptr, T, aligned) { return loada(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSV(T, NSIMD_SIMD) load(const T *ptr, T, unaligned) { return loadu(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSV(T, SimdExt) load(const T *ptr, T, SimdExt, aligned) { return loada(ptr, T(), SimdExt()); } template NSIMD_NSV(T, SimdExt) load(const T *ptr, T, SimdExt, unaligned) { return loadu(ptr, T(), SimdExt()); } template NSIMD_NSVX2(T, NSIMD_SIMD) load2(const T *ptr, T, aligned) { return load2a(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSVX2(T, NSIMD_SIMD) load2(const T *ptr, T, unaligned) { return load2u(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSVX2(T, SimdExt) load2(const T *ptr, T, SimdExt, aligned) { return load2a(ptr, T(), SimdExt()); } template NSIMD_NSVX2(T, SimdExt) load2(const T *ptr, T, SimdExt, unaligned) { return load2u(ptr, T(), SimdExt()); } template NSIMD_NSVX3(T, NSIMD_SIMD) load3(const T *ptr, T, aligned) { return load3a(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSVX3(T, NSIMD_SIMD) load3(const T *ptr, T, unaligned) { return load3u(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSVX3(T, SimdExt) load3(const T *ptr, T, SimdExt, aligned) { return load3a(ptr, T(), SimdExt()); } template NSIMD_NSVX3(T, SimdExt) load3(const T *ptr, T, SimdExt, unaligned) { return load3u(ptr, T(), SimdExt()); } template NSIMD_NSVX4(T, NSIMD_SIMD) load4(const T *ptr, T, aligned) { return load4a(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSVX4(T, NSIMD_SIMD) load4(const T *ptr, T, unaligned) { return load4u(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSVX4(T, SimdExt) load4(const T *ptr, T, SimdExt, aligned) { return load4a(ptr, T(), SimdExt()); } template NSIMD_NSVX4(T, SimdExt) load4(const T *ptr, T, SimdExt, unaligned) { 
return load4u(ptr, T(), SimdExt()); } template NSIMD_NSVL(T, NSIMD_SIMD) loadlu(const T *ptr, T, aligned) { return loadla(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSVL(T, NSIMD_SIMD) loadlu(const T *ptr, T, unaligned) { return loadlu(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSVL(T, NSIMD_SIMD) loadlu(const T *ptr, T, SimdExt, aligned) { return loadla(ptr, T(), SimdExt()); } template NSIMD_NSVL(T, NSIMD_SIMD) loadlu(const T *ptr, T, SimdExt, unaligned) { return loadlu(ptr, T(), SimdExt()); } template void store(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, T, aligned) { storea(ptr, a1, T(), NSIMD_SIMD()); } template void store(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, T, unaligned) { storeu(ptr, a1, T(), NSIMD_SIMD()); } template void store(T *ptr, NSIMD_NSV(T, SimdExt) a1, T, SimdExt, aligned) { storea(ptr, a1, T(), SimdExt()); } template void store(T *ptr, NSIMD_NSV(T, SimdExt) a1, T, SimdExt, unaligned) { storeu(ptr, a1, T(), SimdExt()); } template void store2(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2, T, aligned) { store2a(ptr, a1, a2, T(), NSIMD_SIMD()); } template void store2(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2, T, unaligned) { store2u(ptr, a1, a2, T(), NSIMD_SIMD()); } template void store2(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2, T, SimdExt, aligned) { store2a(ptr, a1, a2, T(), SimdExt()); } template void store2(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2, T, SimdExt, unaligned) { store2u(ptr, a1, a2, T(), SimdExt()); } template void store3(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2, NSIMD_NSV(T, NSIMD_SIMD) a3, T, aligned) { store3a(ptr, a1, a2, a3, T(), NSIMD_SIMD()); } template void store3(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2, NSIMD_NSV(T, NSIMD_SIMD) a3, T, unaligned) { store3u(ptr, a1, a2, a3, T(), NSIMD_SIMD()); } template void store3(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2, NSIMD_NSV(T, SimdExt) a3, T, SimdExt, 
aligned) { store3a(ptr, a1, a2, a3, T(), SimdExt()); } template void store3(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2, NSIMD_NSV(T, SimdExt) a3, T, SimdExt, unaligned) { store3u(ptr, a1, a2, a3, T(), SimdExt()); } template void store4(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2, NSIMD_NSV(T, NSIMD_SIMD) a3, NSIMD_NSV(T, NSIMD_SIMD) a4, T, aligned) { store4a(ptr, a1, a2, a3, a4, T(), NSIMD_SIMD()); } template void store4(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2, NSIMD_NSV(T, NSIMD_SIMD) a3, NSIMD_NSV(T, NSIMD_SIMD) a4, T, unaligned) { store4u(ptr, a1, a2, a3, a4, T(), NSIMD_SIMD()); } template void store4(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2, NSIMD_NSV(T, SimdExt) a3, NSIMD_NSV(T, SimdExt) a4, T, SimdExt, aligned) { store4a(ptr, a1, a2, a3, a4, T(), SimdExt()); } template void store4(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2, NSIMD_NSV(T, SimdExt) a3, NSIMD_NSV(T, SimdExt) a4, T, SimdExt, unaligned) { store4u(ptr, a1, a2, a3, a4, T(), SimdExt()); } template void storel(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, T, aligned) { storela(ptr, a1, T(), NSIMD_SIMD()); } template void storel(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, T, unaligned) { storelu(ptr, a1, T(), NSIMD_SIMD()); } template void storel(T *ptr, NSIMD_NSV(T, SimdExt) a1, T, SimdExt, aligned) { storela(ptr, a1, T(), SimdExt()); } template void storel(T *ptr, NSIMD_NSV(T, SimdExt) a1, T, SimdExt, unaligned) { storelu(ptr, a1, T(), SimdExt()); } } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* Scalar utilisties */ #include /* ------------------------------------------------------------------------- */ /* Some undefs */ #if NSIMD_CXX > 0 #undef NSIMD_NSV #undef NSIMD_NSVX2 #undef NSIMD_NSVX3 #undef NSIMD_NSVX4 #undef NSIMD_NSVL #endif /* ------------------------------------------------------------------------- */ /* isnan, isnormal and isinf functions */ 
/* Scalar classification helpers. All of them assume IEEE-754 bit layouts:
   f16 = 1 sign + 5 exponent + 10 mantissa bits, f32 = 1 + 8 + 23,
   f64 = 1 + 11 + 52. They return 1 (true) or 0 (false).
   NaN  <=> exponent all ones and mantissa nonzero.
   Inf  <=> exponent all ones and mantissa zero.
   The isnormal variants report 0 exactly for denormals (zero exponent,
   nonzero mantissa); note that, as written, zero, infinities and NaNs all
   report 1, consistently across the three precisions. */

NSIMD_INLINE int nsimd_isnan_f16(f16 a) {
  /* We assume IEEE representation for f16's */
  u16 b = nsimd_scalar_reinterpret_u16_f16(a);
  /* (b << 6) keeps only the 10 mantissa bits (in a u32 promotion) */
  if ((((((u32)b) >> 10) & 0x1F) == 0x1F) && ((((u32)b) << 6) != 0u)) {
    return 1;
  } else {
    return 0;
  }
}

NSIMD_INLINE int nsimd_isnan_f32(f32 a) {
  /* We assume IEEE representation for f32's */
  u32 b = nsimd_scalar_reinterpret_u32_f32(a);
  if ((((b >> 23) & 0xFF) == 0xFF) && ((b << 9) != 0u)) {
    return 1;
  } else {
    return 0;
  }
}

NSIMD_INLINE int nsimd_isnan_f64(f64 a) {
  /* We assume IEEE representation for f64's */
  u64 b = nsimd_scalar_reinterpret_u64_f64(a);
  if ((((b >> 52) & 0x7FF) == 0x7FF) && ((b << 12) != 0u)) {
    return 1;
  } else {
    return 0;
  }
}

NSIMD_INLINE int nsimd_isinf_f16(f16 a) {
  /* We assume IEEE representation for f16's */
  u16 b = nsimd_scalar_reinterpret_u16_f16(a);
  if ((((((u32)b) >> 10) & 0x1F) == 0x1F) && ((((u32)b) << 6) == 0u)) {
    return 1;
  } else {
    return 0;
  }
}

NSIMD_INLINE int nsimd_isinf_f32(f32 a) {
  /* We assume IEEE representation for f32's */
  u32 b = nsimd_scalar_reinterpret_u32_f32(a);
  if ((((b >> 23) & 0xFF) == 0xFF) && ((b << 9) == 0u)) {
    return 1;
  } else {
    return 0;
  }
}

NSIMD_INLINE int nsimd_isinf_f64(f64 a) {
  /* We assume IEEE representation for f64's */
  u64 b = nsimd_scalar_reinterpret_u64_f64(a);
  if ((((b >> 52) & 0x7FF) == 0x7FF) && ((b << 12) == 0u)) {
    return 1;
  } else {
    return 0;
  }
}

NSIMD_INLINE int nsimd_isnormal_f16(f16 a) {
  /* We assume IEEE representation for f16's.
     NOTE(fix): the negation below was missing, so this function returned 1
     *only* for denormals, i.e. exactly the opposite of
     nsimd_isnormal_f32/nsimd_isnormal_f64. It now follows the same
     "not a denormal" logic as the f32/f64 variants. */
  u16 b = nsimd_scalar_reinterpret_u16_f16(a);
  if (!((((((u32)b) >> 10) & 0x1F) == 0u) && ((((u32)b) << 6) != 0u))) {
    return 1;
  } else {
    return 0;
  }
}

NSIMD_INLINE int nsimd_isnormal_f32(f32 a) {
  /* We assume IEEE representation for f32's */
  u32 b = nsimd_scalar_reinterpret_u32_f32(a);
  if (!((((b >> 23) & 0xFF) == 0u) && ((b << 9) != 0u))) {
    return 1;
  } else {
    return 0;
  }
}

NSIMD_INLINE int nsimd_isnormal_f64(f64 a) {
  /* We assume IEEE representation for f64's */
  u64 b = nsimd_scalar_reinterpret_u64_f64(a);
  if (!((((b >> 52) & 0x7FF) == 0u)
&& ((b << 12) != 0u))) { return 1; } else { return 0; } } #if NSIMD_CXX > 0 namespace nsimd { NSIMD_INLINE int isnan(f16 a) { return nsimd_isnan_f16(a); } NSIMD_INLINE int isnan(f32 a) { return nsimd_isnan_f32(a); } NSIMD_INLINE int isnan(f64 a) { return nsimd_isnan_f64(a); } NSIMD_INLINE int isinf(f16 a) { return nsimd_isinf_f16(a); } NSIMD_INLINE int isinf(f32 a) { return nsimd_isinf_f32(a); } NSIMD_INLINE int isinf(f64 a) { return nsimd_isinf_f64(a); } NSIMD_INLINE int isnormal(f16 a) { return nsimd_isnormal_f16(a); } NSIMD_INLINE int isnormal(f32 a) { return nsimd_isnormal_f32(a); } NSIMD_INLINE int isnormal(f64 a) { return nsimd_isnormal_f64(a); } } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* Difference in log UFP, returns an nat, see documentation for more infos */ #if NSIMD_CXX > 0 extern "C" { #endif NSIMD_DLLSPEC int nsimd_ufp_f16(f16, f16); NSIMD_DLLSPEC int nsimd_ufp_f32(f32, f32); NSIMD_DLLSPEC int nsimd_ufp_f64(f64, f64); #if NSIMD_CXX > 0 } // extern "C" #endif #if NSIMD_CXX > 0 namespace nsimd { NSIMD_INLINE int ufp(f16 a, f16 b) { return nsimd_ufp_f16(a, b); } NSIMD_INLINE int ufp(f32 a, f32 b) { return nsimd_ufp_f32(a, b); } NSIMD_INLINE int ufp(f64 a, f64 b) { return nsimd_ufp_f64(a, b); } } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* Get last kernel parameter */ #if NSIMD_CXX > 0 extern "C" { #endif NSIMD_DLLSPEC nsimd_nat nsimd_kernel_param(nsimd_nat, nsimd_nat); #if NSIMD_CXX > 0 } // extern "C" #endif /* ------------------------------------------------------------------------- */ #endif ================================================ FILE: scripts/FindNSIMD.cmake ================================================ # MIT License # # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to 
deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # #.rst: # FindNSIMD # --------- # # Find the NSIMD library, Agenium Scale's vectorization library. # # Result variables # ^^^^^^^^^^^^^^^^ # # This module will set the following variables in your project: # # ``NSIMD_INCLUDE_DIRS`` # where to find nsimd.h, etc. # ``NSIMD_LIBRARY_DIRS`` # where to find the library to link against to use NSIMD. # ``NSIMD_LIBRARIES`` # the library to link against to use NSIMD. # ``NSIMD_FOUND`` # If false, do not try to use NSIMD. 
if (NOT NSIMD_FOUND AND NOT DEFINED NSIMD_LIBRARIES)
  list(LENGTH NSIMD_FIND_COMPONENTS l)
  if ("${l}" STREQUAL "0")
    # No component requested: accept any SIMD variant of the library.
    find_library(NSIMD_LIBRARIES
                 NAMES nsimd_cpu nsimd_sse2 nsimd_sse42 nsimd_avx nsimd_avx2
                       nsimd_avx512_knl nsimd_avx512_skylake nsimd_neon128
                       nsimd_aarch64 nsimd_sve nsimd_sve128 nsimd_sve256
                       nsimd_sve512 nsimd_sve1024 nsimd_sve2048 nsimd_cuda
                       nsimd_rocm)
  elseif("${l}" STREQUAL "1")
    # Exactly one component: look only for that SIMD variant.
    list(GET NSIMD_FIND_COMPONENTS 0 simd_ext)
    find_library(NSIMD_LIBRARIES NAMES nsimd_${simd_ext})
  else()
    # Several components is a usage error. Report it even in QUIET mode:
    # previously the error was silenced by NSIMD_FIND_QUIETLY, which left
    # NSIMD_LIBRARIES undefined and let the success test below wrongly set
    # NSIMD_FOUND to TRUE with an empty library.
    message(FATAL_ERROR "cannot handle several components")
  endif()
endif()

if (NOT NSIMD_FOUND AND NOT DEFINED NSIMD_INCLUDE_DIRS)
  find_path(NSIMD_INCLUDE_DIRS NAMES nsimd/nsimd.h)
endif()

# Truthiness covers both the "<var>-NOTFOUND" failure value and a variable
# that was never set, which the former STREQUAL comparison did not.
if (NSIMD_INCLUDE_DIRS AND NSIMD_LIBRARIES)
  get_filename_component(NSIMD_LIBRARY_DIRS ${NSIMD_LIBRARIES} DIRECTORY)
  if (NOT NSIMD_FIND_QUIETLY)
    message(STATUS "[include dir = ${NSIMD_INCLUDE_DIRS}]"
                   " [library = ${NSIMD_LIBRARIES}]")
  endif()
  set(NSIMD_FOUND TRUE)
else()
  # Build a diagnostic describing which of the two lookups failed.
  if (NOT DEFINED NSIMD_INCLUDE_DIRS)
    set(msg "[cannot determine include dir]")
  else()
    set(msg "[include dir = ${NSIMD_INCLUDE_DIRS}]")
  endif()
  if (NOT DEFINED NSIMD_LIBRARIES)
    set(msg "${msg} [cannot determine library dir]")
  else()
    set(msg "${msg} [library = ${NSIMD_LIBRARIES}]")
  endif()
  # Per CMake find-module conventions, QUIET suppresses status output but
  # must never suppress the fatal error mandated by REQUIRED.
  if (NSIMD_FIND_REQUIRED)
    message(FATAL_ERROR "${msg}")
  elseif (NOT NSIMD_FIND_QUIETLY)
    message(STATUS "${msg}")
  endif()
  set(NSIMD_FOUND FALSE)
endif()

================================================
FILE: scripts/aarch64-linux-gnu-clang++.sh
================================================
#!/bin/bash
clang++ --target=aarch64-linux-gnu "$@"

================================================
FILE: scripts/aarch64-linux-gnu-clang.sh
================================================
#!/bin/bash
clang --target=aarch64-linux-gnu "$@"

================================================
FILE: scripts/build-tests.bat
================================================ @echo off REM Copyright (c) 2020 Agenium Scale REM REM Permission is hereby granted, free of charge, to any person obtaining a copy REM of this software and associated documentation files (the "Software"), to deal REM in the Software without restriction, including without limitation the rights REM to use, copy, modify, merge, publish, distribute, sublicense, and/or sell REM copies of the Software, and to permit persons to whom the Software is REM furnished to do so, subject to the following conditions: REM REM The above copyright notice and this permission notice shall be included in all REM copies or substantial portions of the Software. REM REM THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR REM IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, REM FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE REM AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER REM LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, REM OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE REM SOFTWARE. REM ########################################################################### setlocal EnableDelayedExpansion pushd "%~dp0" REM ########################################################################### set BUILD_BAT="%CD%\build.bat" set HATCH_PY="%CD%\..\egg\hatch.py" set NSCONFIG="%CD%\..\nstools\bin\nsconfig.exe" set BUILD_ROOT="%CD%\.." 
REM ###########################################################################
REM Run build.bat (forwards all arguments; it validates them and builds the
REM library once per SIMD extension per compiler)

call %BUILD_BAT% %*
if errorlevel 1 goto end_nok

REM ###########################################################################
REM Generate NSIMD tests

python %HATCH_PY% -tf
if errorlevel 1 goto end_nok

REM ###########################################################################
REM Build tests (checks on command line arguments have been done by build.bat)

set SIMD_EXTS_ARG=%2
set SIMD_EXTS=%SIMD_EXTS_ARG:/=,%
if "%3" == "" (
  REM Default must match build.bat's default (msvc), otherwise the default
  REM invocation builds the library in build-<simd>-msvc but then configures
  REM the tests with -suite=cl in a different build-<simd>-cl directory.
  set COMPILER_ARG=msvc
) else (
  set COMPILER_ARG=%4
)
set COMPILERS=%COMPILER_ARG:/=,%

for %%g in (%COMPILERS%) do (
  for %%h in (%SIMD_EXTS%) do (
    set BUILD_DIR=%BUILD_ROOT%\build-%%h-%%g
    if exist !BUILD_DIR! rd /Q /S !BUILD_DIR!
    md !BUILD_DIR!
    pushd !BUILD_DIR!
    %NSCONFIG% .. -Dsimd=%%h -suite=%%g
    REM targets.txt (optional) contains regexes restricting which test
    REM targets to build; otherwise build the whole "tests" target.
    if exist %BUILD_ROOT%\targets.txt (
      set "TS= "
      for /F %%k in ('type %BUILD_ROOT%\targets.txt') do (
        ninja -t targets all | findstr /R "^tests" | findstr /R "%%k" ^
          >_targets.txt
        for /F %%l in ('type _targets.txt') do (
          set TMP1=%%l
          REM Strip the trailing colon ninja prints after each target name.
          set T=!TMP1::=!
          set TS=!TS! !T!
        )
      )
    ) else (
      set TS=tests
    )
    echo *** !TS!
    ninja !TS!
popd ) ) REM ########################################################################### :end_ok popd endlocal exit /B 0 :end_nok popd endlocal exit /B 1 ================================================ FILE: scripts/build-tests.sh ================================================ #!/bin/bash # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. ############################################################################### cd `dirname $0` #set -x set -e ############################################################################### # Init BUILD_SH="${PWD}/build.sh" HATCH_PY="${PWD}/../egg/hatch.py" BUILD_ROOT="${PWD}/.." 
############################################################################### # Generate NSIMD tests python3 --version 1>/dev/null 2>/dev/null && \ python3 "${HATCH_PY}" -tf || \ python "${HATCH_PY}" -tf ############################################################################### # Run build.sh bash "${BUILD_SH}" "$@" || exit 1 ############################################################################### # Parse command line arguments (check has been done by build.sh) SIMD_EXTS=`echo "${2}" | sed -e 's,/, ,g'` if [ "${3}" == "" ]; then COMPILER_ARG="gcc" else COMPILER_ARG="${4}" fi COMPILERS=`echo ${COMPILER_ARG} | sed 's,/, ,g'` ############################################################################### # Build tests for compiler in ${COMPILERS}; do for simd_ext in ${SIMD_EXTS}; do BUILD_DIR="${BUILD_ROOT}/build-${simd_ext}-${compiler}" if [ -e "${BUILD_ROOT}/targets.txt" ]; then GLOBS=`cat ${BUILD_ROOT}/targets.txt | tr '\n' '|' | sed 's/|$//g'` TARGETS=`(cd ${BUILD_DIR} && ninja -t targets all | grep -E '^tests.') \ | sed 's/:.*//g' | grep -E "(${GLOBS})" | tr '\n' ' '` else TARGETS="tests" fi (cd "${BUILD_DIR}" && ninja ${TARGETS}) done done ================================================ FILE: scripts/build.bat ================================================ @echo off REM Copyright (c) 2020 Agenium Scale REM REM Permission is hereby granted, free of charge, to any person obtaining a copy REM of this software and associated documentation files (the "Software"), to deal REM in the Software without restriction, including without limitation the rights REM to use, copy, modify, merge, publish, distribute, sublicense, and/or sell REM copies of the Software, and to permit persons to whom the Software is REM furnished to do so, subject to the following conditions: REM REM The above copyright notice and this permission notice shall be included in all REM copies or substantial portions of the Software. 
REM REM THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR REM IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, REM FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE REM AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER REM LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, REM OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE REM SOFTWARE. REM ########################################################################### setlocal EnableDelayedExpansion pushd "%~dp0" REM ########################################################################### REM Init set SETUP_BAT="%CD%\setup.bat" set NSCONFIG="%CD%\..\nstools\bin\nsconfig.exe" set HATCH_PY="%CD%\..\egg\hatch.py" set BUILD_ROOT="%CD%\.." REM ########################################################################### REM Run setup call %SETUP_BAT% if errorlevel 1 goto end_nok REM ########################################################################### REM Generate NSIMD python %HATCH_PY% -lf if errorlevel 1 goto end_nok REM ########################################################################### REM Check/parse command line arguments if "%1" == "" ( echo %0: usage: %0 for simd_ext1/.../simd_ext2 [with compiler1/.../compiler2] goto end_nok ) if not "%1" == "for" ( echo ERROR: expected 'for' as first argument goto end_nok ) if "%2" == "" ( echo "ERROR: no SIMD extension given" goto end_nok ) set SIMD_EXTS_ARG=%2 set SIMD_EXTS=%SIMD_EXTS_ARG:/=,% if "%3" == "" ( set COMPILER_ARG=msvc ) else ( if "%3" == "with" ( if "%4" == "" ( echo "ERROR: no compiler given after with" goto end_nok ) set COMPILER_ARG=%4 ) else ( echo ERROR: expected 'with' as fourth argument goto end_nok ) ) set COMPILERS=%COMPILER_ARG:/=,% REM ########################################################################### REM Build NSIMD : one build directory per SIMD extension per compiler for %%g 
in (%COMPILERS%) do ( for %%h in (%SIMD_EXTS%) do ( set BUILD_DIR=%BUILD_ROOT%\build-%%h-%%g if exist !BUILD_DIR! rd /Q /S !BUILD_DIR! md !BUILD_DIR! pushd !BUILD_DIR! %NSCONFIG% .. -Dsimd=%%h -suite=%%g ninja popd ) ) REM ########################################################################### :end_ok popd endlocal exit /B 0 :end_nok popd endlocal exit /B 1 ================================================ FILE: scripts/build.sh ================================================ #!/bin/bash # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. ############################################################################### cd `dirname $0` set -x set -e ############################################################################### # Init SETUP_SH="${PWD}/setup.sh" NSCONFIG="${PWD}/../nstools/nsconfig/nsconfig" HATCH_PY="${PWD}/../egg/hatch.py" BUILD_ROOT="${PWD}/.." 
############################################################################### # Run setup bash "${SETUP_SH}" ############################################################################### # Generate NSIMD python3 --version 1>/dev/null 2>/dev/null && \ python3 "${HATCH_PY}" -lf || \ python "${HATCH_PY}" -lf ############################################################################### # Check/parse command line arguments if [ "${1}" == "" ]; then echo "$0: usage: $0 for simd_ext1,...,simd_ext2 [with compiler]" exit 0 fi if [ "${1}" != "for" ]; then echo "ERROR: expected 'for' as first argument" exit 1 fi if [ "${2}" == "" ]; then echo "ERROR: no SIMD extension given after 'for'" exit 1 fi SIMD_EXTS=`echo "${2}" | sed -e 's,/, ,g'` if [ "${3}" == "" ]; then COMPILER_ARG="gcc" elif [ "${3}" == "with" ]; then if [ "${4}" == "" ]; then echo "ERROR: no compiler given after 'with'" exit 1 fi COMPILER_ARG="${4}" else echo "ERROR: expected 'with' as fourth argument" exit 1 fi COMPILERS=`echo ${COMPILER_ARG} | sed 's,/, ,g'` ############################################################################### # Build NSIMD : one build directory per SIMD extension per compiler for compiler in ${COMPILERS}; do for simd_ext in ${SIMD_EXTS}; do BUILD_DIR="${BUILD_ROOT}/build-${simd_ext}-${compiler}" rm -rf "${BUILD_DIR}" mkdir -p "${BUILD_DIR}" (cd "${BUILD_DIR}" && \ "${NSCONFIG}" .. 
-Dsimd=${simd_ext} -suite=${compiler}) (cd "${BUILD_DIR}" && ninja) done done ================================================ FILE: scripts/ci-clang.txt ================================================ camelot.numscale.com (sse2-sse42-clang) - bash scripts/build-tests.sh for sse2/sse42 with clang - cd build-sse2-clang - ../nstools/bin/nstest -j80 - cd ../build-sse42-clang - ../nstools/bin/nstest -j80 gaunes.numscale.com (avx-avx2-clang) - bash scripts/build-tests.sh for avx/avx2 with clang - cd build-avx-clang - ../nstools/bin/nstest -j80 - cd ../build-avx2-clang - ../nstools/bin/nstest -j80 caradigan.numscale.com (aarch64-clang-1) - bash scripts/setup.sh - python3 egg/hatch.py -ltf - mkdir build-aarch64-clang - cd build-aarch64-clang - ../nstools/bin/nsconfig .. -Dsimd=aarch64 -comp=clang - ninja tests.c99 tests.cpp98 tests.cpp11 - ../nstools/bin/nstest -j80 carahes.numscale.com (aarch64-clang-2) - bash scripts/setup.sh - python3 egg/hatch.py -ltf - mkdir build-aarch64-clang - cd build-aarch64-clang - ../nstools/bin/nsconfig .. -Dsimd=aarch64 -comp=clang - ninja tests.c99 tests.cpp98 tests.cpp11 - ../nstools/bin/nstest -j80 camlann.numscale.com (aarch64-clang-3) - bash scripts/setup.sh - python3 egg/hatch.py -ltf - mkdir build-aarch64-clang - cd build-aarch64-clang - ../nstools/bin/nsconfig .. -Dsimd=aarch64 -comp=clang - ninja tests.c99 tests.cpp98 tests.cpp11 - ../nstools/bin/nstest -j80 ================================================ FILE: scripts/ci-scale.txt ================================================ camelot.hpc.scale {/home/gquintin} - mkdir cmake-build-sse2 - cd cmake-build-sse2 - cmake .. -Dsimd=sse2 - make -j10 - cd .. - mkdir cmake-build-sse42 - cd cmake-build-sse42 - cmake .. -Dsimd=sse42 - make -j10 - cd .. 
- bash scripts/build-tests.sh for sse2/sse42 with gcc - cd build-sse2-gcc - ../nstools/bin/nstest -j80 - cd ../build-sse42-gcc - ../nstools/bin/nstest -j80 glastonbury.hpc.scale {/home/gquintin} - source /etc/profile.d/modules.sh - module load cmake/3.1.0 - mkdir cmake-build-avx512_skylake - cd cmake-build-avx512_skylake - cmake .. -Dsimd=avx512_skylake - make -j10 - cd .. - bash scripts/build-tests.sh for avx512_skylake with gcc - cd build-avx512_skylake-gcc - ../nstools/bin/nstest -j40 carduel.hpc.scale {/home/gquintin} - source /etc/profile.d/profile.sh - module load cmake/3.1.0 - mkdir cmake-build-avx512_knl - cd cmake-build-avx512_knl - cmake .. -Dsimd=avx512_knl - make -j10 - cd .. - bash scripts/build-tests.sh for avx512_knl with gcc - cd build-avx512_knl-gcc - ../nstools/bin/nstest -j80 gaunes.hpc.scale {/home/gquintin} - mkdir cmake-build-avx - cd cmake-build-avx - cmake .. -Dsimd=avx - make -j10 - cd .. - mkdir cmake-build-avx2 - cd cmake-build-avx2 - cmake .. -Dsimd=avx2 - make -j10 - cd .. - bash scripts/build-tests.sh for avx/avx2 with gcc - cd build-avx-gcc - ../nstools/bin/nstest -j80 - cd ../build-avx2-gcc - ../nstools/bin/nstest -j80 - cd .. - mkdir cmake-build-armel - cd cmake-build-armel - cmake .. -Dsimd=neon128 -DCMAKE_CXX_COMPILER=arm-linux-gnueabi-gcc - make -j10 - cd .. - mkdir build-neon128-gcc - cd build-neon128-gcc - ../nstools/bin/nsconfig .. -Dsimd=neon128 -comp=cc,gcc,arm-linux-gnueabi-gcc,5,armel -comp=c++,gcc,arm-linux-gnueabi-g++,5,armel - ninja tests - ../nstools/bin/nstest -j80 --prefix="qemu-arm" logres.hpc.scale {/home/gquintin} - mkdir cmake-build-cpu - cd cmake-build-cpu - cmake .. -Dsimd=cpu - make -j10 - cd .. - bash scripts/build-tests.sh for cpu with gcc - cd build-cpu-gcc - ../nstools/bin/nstest -j80 - export PATH=${PATH}:/usr/local/cuda/bin - export LD_LIBRARY_PATH=/usr/local/cuda/lib64 - mkdir ../build-cuda-nvcc - cd ../build-cuda-nvcc - ../nstools/bin/nsconfig .. 
-Dsimd=cuda -Dcuda_arch_flags=-msm_75 -suite=cuda - ninja tests - ../nstools/bin/nstest -j20 bowden.hpc.scale {/home/gquintin} - bash scripts/build-tests.sh for rocm with rocm - cd build-rocm-rocm - ../nstools/bin/nstest -j80 - cd .. - mkdir build-cpp20 - source /etc/profile.d/profile.sh - module load gcc/10.2.0 - cd build-cpp20 - ../nstools/bin/nsconfig .. -Dsimd=sse42 -suite=gcc - ninja tests.cpp20 - ../nstools/bin/nstest -j80 - cd .. - bash tests/FindNSIMD.cmake.sh caradigan.hpc.scale {/home/gquintin} - mkdir cmake-build-aarch64 - cd cmake-build-aarch64 - cmake .. -Dsimd=aarch64 - make -j10 - cd .. - bash scripts/build-tests.sh for aarch64 with gcc - cd build-aarch64-gcc - ../nstools/bin/nstest -j80 - cd .. - mkdir cmake-build-neon128 - cd cmake-build-neon128 - cmake .. -Dsimd=neon128 -DCMAKE_CXX_COMPILER=arm-linux-gnueabihf-gcc -DNSIMD_ARM32_IS_ARMEL=OFF - make -j10 - cd .. - mkdir build-neon128-gcc - cd build-neon128-gcc - ../nstools/bin/nsconfig .. -Dsimd=neon128 -comp=cc,gcc,arm-linux-gnueabihf-gcc,5,armhf -comp=c++,gcc,arm-linux-gnueabihf-g++,5,armhf - ninja tests - ../nstools/bin/nstest -j80 carahes.hpc.scale {/home/gquintin} - source /etc/profile.d/profile.sh - module load gcc/10.2.0 - mkdir cmake-build-sve128 - cd cmake-build-sve128 - cmake .. -Dsimd=sve128 - make -j10 - cd .. - bash scripts/build-tests.sh for sve128 with gcc - cd build-sve128-gcc - module load qemu/4.2.0 - ../nstools/bin/nstest -j80 --prefix="qemu-aarch64 -cpu max,sve-max-vq=1" WIN.gorre2 {/home/gquintin} ["C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Auxiliary\Build\vcvars64.bat"] - setlocal - call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x86 - set PATH=%PATH%;C:\Program Files (x86)\CMake\bin - md cmake-build32-sse2 - cd cmake-build32-sse2 - cmake .. -Dsimd=sse2 -DCMAKE_CXX_COMPILER=cl -G "NMake Makefiles" - nmake - cd .. - md cmake-build32-sse42 - cd cmake-build32-sse42 - cmake .. 
-Dsimd=sse42 -DCMAKE_CXX_COMPILER=cl -G "NMake Makefiles" - nmake - cd .. - md cmake-build32-avx - cd cmake-build32-avx - cmake .. -Dsimd=avx -DCMAKE_CXX_COMPILER=cl -G "NMake Makefiles" - nmake - cd .. - md cmake-build32-avx2 - cd cmake-build32-avx2 - cmake .. -Dsimd=avx2 -DCMAKE_CXX_COMPILER=cl -G "NMake Makefiles" - nmake - cd .. - call scripts\build for sse2/sse42/avx/avx2 with msvc - endlocal - setlocal - call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Auxiliary\Build\vcvars64.bat" - md cmake-build64-sse2 - cd cmake-build64-sse2 - cmake .. -Dsimd=sse2 -DCMAKE_CXX_COMPILER=cl -G "NMake Makefiles" - nmake - cd .. - md cmake-build64-sse42 - cd cmake-build64-sse42 - cmake .. -Dsimd=sse42 -DCMAKE_CXX_COMPILER=cl -G "NMake Makefiles" - nmake - cd .. - md cmake-build64-avx - cd cmake-build64-avx - cmake .. -Dsimd=avx -DCMAKE_CXX_COMPILER=cl -G "NMake Makefiles" - nmake - cd .. - md cmake-build64-avx2 - cd cmake-build64-avx2 - cmake .. -Dsimd=avx2 -DCMAKE_CXX_COMPILER=cl -G "NMake Makefiles" - nmake - cd .. - call scripts\build-tests for avx2 with msvc - cd build-avx2-msvc - ..\nstools\bin\nstest -j60 - endlocal couillere {/Users/gquintin} - export PATH=${PATH}:/opt/homebrew/bin - python3 egg/hatch.py -ltf - bash scripts/setup.sh - mkdir build-aarch64-xcode - cd build-aarch64-xcode - ../nstools/bin/nsconfig .. -Dsimd=aarch64 -suite=llvm -Dmpfr="-I/opt/homebrew/include -L/opt/homebrew/lib -lmpfr" - ninja - ninja tests - ../nstools/bin/nstest -j16 ================================================ FILE: scripts/ci-test.txt ================================================ couillere {/Users/gquintin} - export PATH=${PATH}:/opt/homebrew/bin - python3 egg/hatch.py -ltf - bash scripts/setup.sh - mkdir build-aarch64-xcode - cd build-aarch64-xcode - ../nstools/bin/nsconfig .. 
-Dsimd=aarch64 -suite=llvm -Dmpfr="-I/opt/homebrew/include -L/opt/homebrew/lib -lmpfr" - ninja - ninja tests - ../nstools/bin/nstest -j16 ================================================ FILE: scripts/ci.sh ================================================ #!/bin/sh # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
############################################################################### # Argument parsing if [ "$2" == "" ]; then echo "ERROR: usage: $0 JOBS_FILE NSTOOLS_CHECKOUT_LAST_COMMIT" exit 1 fi JOBS_FILE="`realpath $1`" NSIMD_NSTOOLS_CHECKOUT_LATER="$2" cd `dirname $0` #set -x set -e ############################################################################### # Init SSH="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ -o LogLevel=error" SCP="scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ -o LogLevel=error" GIT_URL=`git remote get-url origin` GIT_BRANCH=`git rev-parse --abbrev-ref HEAD` TMP_DIR="${PWD}/../_ci" ONE_LINER_C="${PWD}/../scripts/one-liner.c" SSHJOB_C="${PWD}/../nstools/sshjob/sshjob.c" # Empty tmp directory if [ -f "${JOBS_FILE}" ]; then rm -rf "${TMP_DIR}" mkdir -p "${TMP_DIR}" fi ############################################################################### # Build jobs scripts if [ -f "${JOBS_FILE}" ]; then CURRENT_JOB="" DESC="" REMOTE_HOST="Linux" while read -r line; do # Empty lines if [ "`echo ${line} | sed 's/[ \t]*//g'`" == "" ]; then continue fi # Comments if [ "`echo ${line} | cut -c 1`" == "#" ]; then continue fi if [ "`echo ${line} | cut -c 1`" == "-" ]; then echo "`echo ${line} | cut -c 2- | sed 's/^ *//g'`" >>"${CURRENT_JOB}" echo >>"${CURRENT_JOB}" else ADDR=`echo ${line} | sed -e 's/<.*//g' -e 's/ *//g'` DESC=`echo ${line} | sed -e 's/.*.*//g'` REMOTE_DIR=`echo ${line} | sed -e 's/.*{//g' -e 's/}.*//g'` EXTRA=`echo ${line} | sed -e 's/.*\[//g' -e 's/].*//g'` REMOTE_HOST=`echo ${ADDR} | head -c 4` echo ${REMOTE_DIR} >"${TMP_DIR}/${ADDR}--${DESC}.work.dir" if [ "${REMOTE_HOST}" == "WIN." 
]; then CURRENT_JOB="${TMP_DIR}/${ADDR}--${DESC}.bat" # <-- this must be before ADDR="`echo ${ADDR} | tail -c +5`" # <-- this REMOTE_HOST="Windows" cat >"${CURRENT_JOB}" <<-EOF @echo off setlocal pushd "%~dp0" set NSTOOLS_CHECKOUT_LAST_COMMIT="${NSTOOLS_CHECKOUT_LAST_COMMIT}" if exist ci-nsimd-${DESC} rd /Q /S ci-nsimd-${DESC} git clone ${GIT_URL} ci-nsimd-${DESC} git -C ci-nsimd-${DESC} checkout ${GIT_BRANCH} pushd ci-nsimd-${DESC} REM ---------------------------------------------------------------- REM User commands from here EOF # On Windows we need a native compiler. On Linux we have cc in the # PATH but on Windows we have nothing. We need a MSVC but there is # no easy way to find one. So we parse what is between [...] which # contains the path to the vcvarsall.bat script to load the compiler cat >"${TMP_DIR}/${ADDR}--${DESC}-native-cl" <<-EOF @echo off setlocal call ${EXTRA} cl %* exit /B %ERRORLEVEL% EOF else CURRENT_JOB="${TMP_DIR}/${ADDR}--${DESC}.sh" REMOTE_HOST="Linux" cat >"${CURRENT_JOB}" <<-EOF #!/bin/sh cd \`dirname \$0\` set -e export NSTOOLS_CHECKOUT_LAST_COMMIT="${NSTOOLS_CHECKOUT_LAST_COMMIT}" rm -rf ci-nsimd-${DESC} git clone ${GIT_URL} ci-nsimd-${DESC} git -C ci-nsimd-${DESC} checkout ${GIT_BRANCH} cd ci-nsimd-${DESC} # ------------------------------------------------------------------ # User commands from here EOF fi fi done <"${JOBS_FILE}" fi ############################################################################### # Launch jobs if [ -f "${JOBS_FILE}" ]; then echo "-- NSIMD CI" echo "-- " echo "-- Initialization:" for job in `find ${TMP_DIR} -iregex '.*\.\(bat\|sh\)'`; do ADDR=`basename ${job} | \ sed -e 's/\.sh$//g' -e 's/\.bat$//g' -e 's/--.*//g'` DESC=`basename ${job} | \ sed -e 's/\.sh$//g' -e 's/\.bat$//g' -e 's/.*--//g'` REMOTE_DIR="`cat ${TMP_DIR}/${ADDR}--${DESC}.work.dir`" W_REMOTE_DIR="`echo ${REMOTE_DIR} | tr / \\\\\\`" REMOTE_HOST=`echo ${ADDR} | head -c 4` if [ "${REMOTE_HOST}" == "WIN." 
]; then REMOTE_HOST="Windows" ADDR="`echo ${ADDR} | tail -c +5`" else REMOTE_HOST="Linux" fi echo "-- Found new job: ${DESC}" echo "-- Remote machine will be: ${ADDR}" if [ "${REMOTE_HOST}" == "Windows" ]; then echo "-- Working directory will be: ${W_REMOTE_DIR}" ${SSH} ${ADDR} if not exist ${W_REMOTE_DIR} md ${W_REMOTE_DIR} else echo "-- Working directory will be: ${REMOTE_DIR}" ${SSH} ${ADDR} mkdir -p ${REMOTE_DIR} fi echo "-- Launching commands" if [ "${REMOTE_HOST}" == "Windows" ]; then ${SCP} ${job} ${ADDR}:${W_REMOTE_DIR} ${SCP} ${ONE_LINER_C} ${ADDR}:${W_REMOTE_DIR} ${SCP} ${SSHJOB_C} ${ADDR}:${W_REMOTE_DIR} ${SCP} ${TMP_DIR}/${ADDR}--${DESC}-native-cl \ ${ADDR}:${W_REMOTE_DIR}\\native-cl.bat ${SSH} ${ADDR} "cd ${W_REMOTE_DIR} & \ native-cl /Ox /W3 /D_CRT_SECURE_NO_WARNINGS one-liner.c" ${SSH} ${ADDR} "cd ${W_REMOTE_DIR} & \ native-cl /Ox /W3 /D_CRT_SECURE_NO_WARNINGS sshjob.c" ${SSH} ${ADDR} "cd ${W_REMOTE_DIR} & \ sshjob run \"`basename ${job}` 2>&1 | \ one-liner ci-nsimd-${DESC}-output.txt \ ci-nsimd-${DESC}-one-liner.txt\"" \ | sed 's/\r//g' >${TMP_DIR}/ci-nsimd-${DESC}-pid.txt else ${SCP} ${job} ${ADDR}:${REMOTE_DIR} ${SCP} ${ONE_LINER_C} ${ADDR}:${REMOTE_DIR} ${SCP} ${SSHJOB_C} ${ADDR}:${REMOTE_DIR} ${SSH} ${ADDR} "cd ${REMOTE_DIR} && cc -O2 one-liner.c -o one-liner" ${SSH} ${ADDR} "cd ${REMOTE_DIR} && cc -O2 sshjob.c -o sshjob" ${SSH} ${ADDR} "cd ${REMOTE_DIR} && \ ./sshjob run \"bash `basename ${job}` 2>&1 | \ ./one-liner ci-nsimd-${DESC}-output.txt \ ci-nsimd-${DESC}-one-liner.txt\"" \ >${TMP_DIR}/ci-nsimd-${DESC}-pid.txt fi done sleep 2 fi ############################################################################### # Build associative arrays REMOTE_HOST_A="" ADDR_A="" DESC_A="" ONE_LINER_A="" KILL_COMMAND_A="" LOG_A="" N=0 for job in `find ${TMP_DIR} -iregex '.*\.\(bat\|sh\)'`; do ADDR=`basename ${job} | \ sed -e 's/\.sh$//g' -e 's/\.bat$//g' -e 's/--.*//g'` DESC=`basename ${job} | \ sed -e 's/\.sh$//g' -e 's/\.bat$//g' -e 's/.*--//g'` 
REMOTE_DIR="`cat ${TMP_DIR}/${ADDR}--${DESC}.work.dir`" W_REMOTE_DIR="`echo ${REMOTE_DIR} | tr / \\\\\\`" LOG="${REMOTE_DIR}/ci-nsimd-${DESC}-output.txt" REMOTE_HOST="`echo ${ADDR} | head -c 4`" PID="`sed -e 's/\r//g' ${TMP_DIR}/ci-nsimd-${DESC}-pid.txt`" if [ "${REMOTE_HOST}" == "WIN." ]; then REMOTE_HOST="Windows" ADDR="`echo ${ADDR} | tail -c +5`" ONE_LINER="${W_REMOTE_DIR}\\ci-nsimd-${DESC}-one-liner.txt" KILL_COMMAND="${W_REMOTE_DIR}\\sshjob kill ${PID}" else REMOTE_HOST="Linux" ONE_LINER="${REMOTE_DIR}/ci-nsimd-${DESC}-one-liner.txt" KILL_COMMAND="${REMOTE_DIR}/sshjob kill ${PID}" fi ADDR_A="${ADDR_A}${ADDR}:" DESC_A="${DESC_A}${DESC}:" ONE_LINER_A="${ONE_LINER_A}${ONE_LINER}:" KILL_COMMAND_A="${KILL_COMMAND_A}${KILL_COMMAND}:" LOG_A="${LOG_A}${LOG}:" REMOTE_HOST_A="${REMOTE_HOST_A}${REMOTE_HOST}:" N=`expr ${N} + 1` done get_a() { echo ${1} | cut -f${2} -d':' } ############################################################################### # Monitor jobs (main event loop) if [ -d "${JOBS_FILE}" ]; then TMP_DIR="${JOBS_FILE}" fi trap "stty echo icanon; exit 0" SIGINT stty -echo -icanon clear key="" selected=1 echo2() { printf "%-${COLUMNS}s" " " printf "\r" echo "${1}" } while true; do if [ "${selected}" -gt "${N}" ]; then selected=${N} fi if [ "${selected}" -lt "1" ]; then selected=1 fi # Display part tput cup 0 0 key="" echo2 echo2 "[q] quit [D] download outputs and quit [T] kill all jobs" echo2 "[j] select next [k] select previous [t] kill selected job" echo2 " [d] see selected job log" echo2 for i in `seq 1 ${N}`; do ( ADDR=`get_a ${ADDR_A} ${i}` ONE_LINER=`get_a ${ONE_LINER_A} ${i}` REMOTE_HOST=`get_a ${REMOTE_HOST_A} ${i}` if [ "${REMOTE_HOST}" == "Windows" ]; then STATUS=`${SSH} ${ADDR} "if exist ${ONE_LINER} type ${ONE_LINER}" \ || true` else STATUS=`${SSH} ${ADDR} "[ -f ${ONE_LINER} ] && cat ${ONE_LINER}" \ || true` fi echo ${STATUS} >${TMP_DIR}/one-liner-${i}.txt ) "${ROOT}/run-${s}.sh" echo >>"${ROOT}/run-${s}.sh" echo 'cd `dirname $0`' 
>>"${ROOT}/run-${s}.sh" echo "mkdir -p ${s}" >>"${ROOT}/run-${s}.sh" echo "cd ${s}" >>"${ROOT}/run-${s}.sh" echo >>"${ROOT}/run-${s}.sh" done continue fi # Standard line (part of a script) if [ "${SIMD_EXTS}" != "" ]; then for s in ${SIMD_EXTS}; do echo ${line} | sed -e "s,SIMD_EXT,${s},g" \ -e "s,SRC_DIR,${PWD}/..,g" \ -e "s,NSCONFIG,${NSCONFIG},g" \ -e "s,NSTEST,${NSTEST},g" \ -e "s,NPROC,${NPROC},g" \ -e "s,TARGET,${TARGET},g" \ >>"${ROOT}/run-${s}.sh" done fi done <"${INPUT}" # ----------------------------------------------------------------------------- # Compile all tests for i in ${ROOT}/*.sh; do ( bash ${i} || true ) | tee ${i}.log done ================================================ FILE: scripts/one-liner.c ================================================ /* Copyright (c) 2020 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/
/* ------------------------------------------------------------------------- */

/* This program needs to be as portable as possible as it is intended for
   Windows hosts with an unknown version of Visual Studio. It is compiled
   before running the tests of NSIMD. Its purpose is to read stdin and put all
   into an accumulator file and from time to time (approximatively every
   second) put a line of text into another file. */

/* NOTE(review): the header names after these #include directives were lost by
   the extraction tool; judging from the symbols used below (fprintf/fopen,
   malloc/realloc/free, strerror, errno, time) they are presumably stdio.h,
   stdlib.h, string.h, errno.h and time.h -- restore before compiling. */
#include
#include
#include
#include
#include

/* Evaluate cmd once; if its result equals error_code, print a diagnostic
   built from errno (cleared beforehand so a stale value is never reported),
   set the surrounding function's `ret` to -1 and jump to the given cleanup
   label.  Relies on `argv` and `ret` being in scope at the expansion site. */
#define DO(cmd, error_code, goto_label_on_error)                             \
  do {                                                                       \
    errno = 0;                                                               \
    if ((cmd) == error_code) {                                               \
      fprintf(stderr, "%s: error: " #cmd ": %s\n", argv[0], strerror(errno)); \
      ret = -1;                                                              \
      goto goto_label_on_error;                                              \
    }                                                                        \
  } while (0)

/* usage: one-liner acc.txt one-liner.txt
   Copies stdin line by line into acc.txt (argv[1]); roughly once per second
   rewrites one-liner.txt (argv[2]) with the latest line, and once more at the
   very end with the text "Finished".  Returns 0 on success, -1 on any I/O or
   allocation error (cleanup is done through the goto labels at the bottom). */
int main(int argc, char **argv) {
  FILE *acc, *one = NULL;
  char *buf;
  int ret = 0;
  size_t n = 1024; /* current capacity of buf */
  time_t tick;     /* time of the last one-liner.txt rewrite */

  if (argc != 3) {
    fprintf(stderr, "%s: ERROR: usage: one-liner acc.txt one-liner.txt",
            argv[0]);
    return -1;
  }

  DO(acc = fopen(argv[1], "wb"), NULL, end);
  DO(buf = malloc(n), NULL, free_acc);
  tick = time(NULL);

  for (;;) {
    time_t t;
    size_t i = 0;
    int end_of_file = 0;

    /* Read one line (or the final unterminated chunk) into buf.  The buffer
       is doubled as soon as i reaches n - 2, so there is always room for the
       trailing "\n" + NUL written on '\n'/EOF. */
    for (;;) {
      int code = fgetc(stdin);
      if (code == EOF || code == '\n') {
        buf[i] = '\n';
        buf[i + 1] = 0;
        end_of_file = (code == EOF);
        break;
      }
      buf[i] = (char)code;
      if (i >= n - 2) {
        n = n * 2;
        /* NOTE(review): on realloc failure the old block is leaked because
           buf is overwritten with NULL before free_buf runs; harmless here
           since the process exits immediately afterwards. */
        DO(buf = realloc(buf, n), NULL, free_buf);
      }
      i++;
    }

    /* Append the line to the accumulator and flush so an external watcher
       always sees up-to-date contents. */
    DO(fputs(buf, acc), EOF, free_buf);
    DO(fflush(acc), EOF, free_buf);

    /* At most once per second, rewrite (truncate + write) the one-line
       status file with the latest line. */
    t = time(NULL);
    if (t - tick >= 1) {
      DO(one = fopen(argv[2], "wb"), NULL, free_buf);
      DO(fputs(buf, one), EOF, free_one);
      DO(fflush(one), EOF, free_one);
      /* NOTE(review): if this fclose fails, DO jumps to free_one while `one`
         is still non-NULL, so the cleanup code closes it a second time --
         a latent double-fclose on an already-failed stream. */
      DO(fclose(one), EOF, free_one);
      one = NULL; /* prevents the cleanup code from closing it again */
      tick = t;
    }

    if (end_of_file) {
      break;
    }
  }

  /* Final status update: falls through into free_one which performs the
     actual fclose of `one`. */
  DO(one = fopen(argv[2], "wb"), NULL, free_buf);
  DO(fputs("Finished", one), EOF, free_one);
  DO(fflush(one), EOF, free_one);

free_one:
  if (one != NULL && fclose(one) == EOF) {
    fprintf(stderr, "%s: NOTE: error on closing '%s': %s\n", argv[0], argv[2],
            strerror(errno));
  }
free_buf:
  free(buf);
free_acc:
  if (fclose(acc) == EOF) {
    fprintf(stderr, "%s: NOTE: error on closing '%s': %s\n", argv[0], argv[1],
            strerror(errno));
  }
end:
  return ret;
}


================================================
FILE: scripts/powerpc64le-linux-gnu-clang++.sh
================================================
#!/bin/bash
# Cross-compilation wrapper: invoke clang++ targeting 64-bit little-endian
# PowerPC, adding the target-specific libstdc++ include directory, and forward
# all remaining arguments untouched.
clang++ --target=powerpc64le-linux-gnu \
        -I/usr/powerpc64le-linux-gnu/include/c++/8/powerpc64le-linux-gnu "$@"


================================================
FILE: scripts/powerpc64le-linux-gnu-clang.sh
================================================
#!/bin/bash
# Same as the clang++ wrapper above but for the C front end.
clang --target=powerpc64le-linux-gnu \
      -I/usr/powerpc64le-linux-gnu/include/c++/8/powerpc64le-linux-gnu "$@"


================================================
FILE: scripts/setup.bat
================================================
@echo off
REM Copyright (c) 2020 Agenium Scale
REM
REM Permission is hereby granted, free of charge, to any person obtaining a copy
REM of this software and associated documentation files (the "Software"), to deal
REM in the Software without restriction, including without limitation the rights
REM to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
REM copies of the Software, and to permit persons to whom the Software is
REM furnished to do so, subject to the following conditions:
REM
REM The above copyright notice and this permission notice shall be included in all
REM copies or substantial portions of the Software.
REM
REM THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
REM IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
REM FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
REM AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
REM LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
REM OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
REM SOFTWARE.
REM ########################################################################### setlocal EnableDelayedExpansion pushd "%~dp0" REM ########################################################################### REM Init set NSTOOLS_DIR="%CD%\..\nstools" REM ########################################################################### REM Pull nstools if exist "%NSTOOLS_DIR%\README.md" ( pushd %NSTOOLS_DIR% git pull || cd . popd ) else ( if exist "..\.git" ( git remote get-url origin >_tmp-nsimd-url.txt set /P NSIMD_URL=<_tmp-nsimd-url.txt set NSTOOLS_URL=!NSIMD_URL:nsimd=nstools! del /F /Q _tmp-nsimd-url.txt pushd ".." git clone !NSTOOLS_URL! nstools popd ) else ( pushd ".." git clone "https://github.com/agenium-scale/nstools.git" nstools popd ) ) if "%NSTOOLS_CHECKOUT_LAST_COMMIT%" == "" ( git -C %NSTOOLS_DIR% checkout v3.0 ) else ( git -C %NSTOOLS_DIR% checkout master ) REM ########################################################################### REM Create bin directory if not exist %NSTOOLS_DIR%\bin ( md %NSTOOLS_DIR%\bin ) REM ########################################################################### REM Build nsconfig (if not already built) pushd %NSTOOLS_DIR%\nsconfig nmake /F Makefile.win nsconfig.exe nmake /F Makefile.win nstest.exe copy /Y "nsconfig.exe" %NSTOOLS_DIR%\bin copy /Y "nstest.exe" %NSTOOLS_DIR%\bin popd popd endlocal exit /B 0 ================================================ FILE: scripts/setup.sh ================================================ #!/bin/bash # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above 
copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. ############################################################################### cd `dirname $0` set -x set -e ############################################################################### # Init NSTOOLS_DIR="${PWD}/../nstools" ############################################################################### # Build nsconfig (if not already built) [ -d "${NSTOOLS_DIR}" ] || \ ( cd "${PWD}/.." && \ ( [ -d .git ] \ && ( git clone `git remote get-url origin | sed s/nsimd/nstools/g` ) \ || ( git clone "https://github.com/agenium-scale/nstools.git" ) ) ) if [ "${NSTOOLS_CHECKOUT_LAST_COMMIT}" == "" ]; then git -C "${NSTOOLS_DIR}" checkout v3.0 else git -C "${NSTOOLS_DIR}" checkout master git -C "${NSTOOLS_DIR}" pull fi ( cd "${NSTOOLS_DIR}/nsconfig" && \ make -B -j8 -f Makefile.nix nsconfig && \ make -B -j8 -f Makefile.nix nstest ) ================================================ FILE: src/dd.h ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

// Double-double ("dd") arithmetic over SIMD vectors, taken from SLEEF.  A
// value is the unevaluated sum x + y of two vdouble lanes with |y| much
// smaller than |x|, giving roughly twice the precision of a plain double.
// The "add" helpers below are error-free transformations: Fast2Sum variants
// (ddadd*) assume |x| >= |y|, while the "add2" variants (2Sum) make no such
// assumption.  Multiplication uses FMA when ENABLE_FMA_DP is set and
// Dekker-style mantissa splitting otherwise.

// On SVE targets the two-vector struct and its accessors are provided
// elsewhere; define them here for every other target.
#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA))
typedef struct {
  vdouble x, y;
} vdouble2;

static vdouble vd2getx_vd_vd2(vdouble2 v) { return v.x; }
static vdouble vd2gety_vd_vd2(vdouble2 v) { return v.y; }
static vdouble2 vd2setxy_vd2_vd_vd(vdouble x, vdouble y) { vdouble2 v; v.x = x; v.y = y; return v; }
static vdouble2 vd2setx_vd2_vd2_vd(vdouble2 v, vdouble d) { v.x = d; return v; }
static vdouble2 vd2sety_vd2_vd2_vd(vdouble2 v, vdouble d) { v.y = d; return v; }
#endif

// Clear the low 27 mantissa bits so that the product of two such "upper"
// halves is exact in double precision (Dekker/Veltkamp splitting).
static INLINE CONST VECTOR_CC vdouble vupper_vd_vd(vdouble d) {
  return vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vcast_vm_i_i(0xffffffff, 0xf8000000)));
}

// Constructors and lane-wise select for vdouble2.
static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_vd_vd(vdouble h, vdouble l) {
  return vd2setxy_vd2_vd_vd(h, l);
}

static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_d_d(double h, double l) {
  return vd2setxy_vd2_vd_vd(vcast_vd_d(h), vcast_vd_d(l));
}

static INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_vd2_vd2(vopmask m, vdouble2 x, vdouble2 y) {
  return vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(m, vd2getx_vd_vd2(x), vd2getx_vd_vd2(y)),
                            vsel_vd_vo_vd_vd(m, vd2gety_vd_vd2(x), vd2gety_vd_vd2(y)));
}

static INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_d_d_d_d(vopmask o, double x1, double y1, double x0, double y0) {
  return vd2setxy_vd2_vd_vd(vsel_vd_vo_d_d(o, x1, x0), vsel_vd_vo_d_d(o, y1, y0));
}

// Left-to-right sums/differences of 3..7 vectors; evaluation order matters
// for the compensated-arithmetic callers, so do not reassociate.
static INLINE CONST VECTOR_CC vdouble vadd_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {
  return vadd_vd_vd_vd(vadd_vd_vd_vd(v0, v1), v2);
}

static INLINE CONST VECTOR_CC vdouble vadd_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
  return vadd_vd_3vd(vadd_vd_vd_vd(v0, v1), v2, v3);
}

static INLINE CONST VECTOR_CC vdouble vadd_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
  return vadd_vd_4vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4);
}

static INLINE CONST VECTOR_CC vdouble vadd_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
  return vadd_vd_5vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5);
}

static INLINE CONST VECTOR_CC vdouble vadd_vd_7vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) {
  return vadd_vd_6vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5, v6);
}

static INLINE CONST VECTOR_CC vdouble vsub_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {
  return vsub_vd_vd_vd(vsub_vd_vd_vd(v0, v1), v2);
}

static INLINE CONST VECTOR_CC vdouble vsub_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
  return vsub_vd_3vd(vsub_vd_vd_vd(v0, v1), v2, v3);
}

static INLINE CONST VECTOR_CC vdouble vsub_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
  return vsub_vd_4vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4);
}

static INLINE CONST VECTOR_CC vdouble vsub_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
  return vsub_vd_5vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4, v5);
}

//

// Negate both components.
static INLINE CONST VECTOR_CC vdouble2 ddneg_vd2_vd2(vdouble2 x) {
  return vcast_vd2_vd_vd(vneg_vd_vd(vd2getx_vd_vd2(x)), vneg_vd_vd(vd2gety_vd_vd2(x)));
}

// Absolute value: |hi|, and the low part's sign is flipped iff hi was
// negative (sign bit extracted via bit masking).
static INLINE CONST VECTOR_CC vdouble2 ddabs_vd2_vd2(vdouble2 x) {
  return vcast_vd2_vd_vd(vabs_vd_vd(vd2getx_vd_vd2(x)),
                         vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(vd2gety_vd_vd2(x)),
                                                          vand_vm_vm_vm(vreinterpret_vm_vd(vd2getx_vd_vd2(x)),
                                                                        vreinterpret_vm_vd(vcast_vd_d(-0.0))))));
}

// Renormalize so that hi carries as much of the value as possible
// (Fast2Sum of the two components).
static INLINE CONST VECTOR_CC vdouble2 ddnormalize_vd2_vd2(vdouble2 t) {
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(t), vd2gety_vd_vd2(t));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(t), s), vd2gety_vd_vd2(t)));
}

// Scale both components; exact when s is a power of two.
static INLINE CONST VECTOR_CC vdouble2 ddscale_vd2_vd2_vd(vdouble2 d, vdouble s) {
  return vd2setxy_vd2_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(d), s), vmul_vd_vd_vd(vd2gety_vd_vd2(d), s));
}

// Fast2Sum: requires |x| >= |y|; the low part is the exact rounding error.
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble s = vadd_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(x, s), y));
}

// 2Sum (Knuth): no magnitude ordering required.
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble s = vadd_vd_vd_vd(x, y);
  vdouble v = vsub_vd_vd_vd(s, x);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(y, v)));
}

// dd + double, Fast2Sum flavor (assumes |x| >= |y|).
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_3vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), y, vd2gety_vd_vd2(x)));
}

// dd - double, Fast2Sum flavor.
static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble s = vsub_vd_vd_vd(vd2getx_vd_vd2(x), y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), y), vd2gety_vd_vd2(x)));
}

// dd + double, 2Sum flavor (no ordering assumption).
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), y);
  vdouble v = vsub_vd_vd_vd(s, vd2getx_vd_vd2(x));
  vdouble w = vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(y, v));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(w, vd2gety_vd_vd2(x)));
}

// double + dd, Fast2Sum flavor.
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd2(vdouble x, vdouble2 y) {
  vdouble s = vadd_vd_vd_vd(x, vd2getx_vd_vd2(y));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_3vd(vsub_vd_vd_vd(x, s), vd2getx_vd_vd2(y), vd2gety_vd_vd2(y)));
}

// double + dd, 2Sum flavor.
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd2(vdouble x, vdouble2 y) {
  vdouble s = vadd_vd_vd_vd(x, vd2getx_vd_vd2(y));
  vdouble v = vsub_vd_vd_vd(s, x);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(s, v)),
                                                           vsub_vd_vd_vd(vd2getx_vd_vd2(y), v)),
                                             vd2gety_vd_vd2(y)));
}

static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  // |x| >= |y|
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_4vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), vd2getx_vd_vd2(y),
                                           vd2gety_vd_vd2(x), vd2gety_vd_vd2(y)));
}

// dd + dd, 2Sum flavor (no ordering assumption).
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  vdouble v = vsub_vd_vd_vd(s, vd2getx_vd_vd2(x));
  vdouble t = vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), vsub_vd_vd_vd(s, v)),
                            vsub_vd_vd_vd(vd2getx_vd_vd2(y), v));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(t, vadd_vd_vd_vd(vd2gety_vd_vd2(x), vd2gety_vd_vd2(y))));
}

static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd_vd(vdouble x, vdouble y) {
  // |x| >= |y|
  vdouble s = vsub_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(s, vsub_vd_vd_vd(vsub_vd_vd_vd(x, s), y));
}

static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  // |x| >= |y|
  vdouble s = vsub_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  vdouble t = vsub_vd_vd_vd(vd2getx_vd_vd2(x), s);
  t = vsub_vd_vd_vd(t, vd2getx_vd_vd2(y));
  t = vadd_vd_vd_vd(t, vd2gety_vd_vd2(x));
  return vd2setxy_vd2_vd_vd(s, vsub_vd_vd_vd(t, vd2gety_vd_vd2(y)));
}

#ifdef ENABLE_FMA_DP
// FMA implementations: vfmapn computes a*b - c and vfmanp computes c - a*b
// exactly-rounded, so the multiplication error terms come out directly.

// dd division via one Newton step on a reciprocal estimate.
static INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {
  vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(n), t);
  vdouble u = vfmapn_vd_vd_vd_vd(t, vd2getx_vd_vd2(n), s);
  vdouble v = vfmanp_vd_vd_vd_vd(vd2gety_vd_vd2(d), t,
                                 vfmanp_vd_vd_vd_vd(vd2getx_vd_vd2(d), t, vcast_vd_d(1)));
  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(s, v, vfma_vd_vd_vd_vd(vd2gety_vd_vd2(n), t, u)));
}

// Exact product of two doubles (2Prod with FMA).
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble s = vmul_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(s, vfmapn_vd_vd_vd_vd(x, y, s));
}

static INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) {
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));
  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x)),
                                                vd2gety_vd_vd2(x),
                                                vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x), s)));
}

static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y),
                                                vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y),
                                                                 vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y), s))));
}

// Product collapsed back to a single vdouble (drops the error term).
static INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {
  return vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y),
                          vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y),
                                           vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y))));
}

static INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) {
  return vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x),
                          vadd_vd_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)),
                                        vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x))));
}

static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), y);
  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), y,
                                                vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), y, s)));
}

// Reciprocal refined with one FMA-based Newton correction.
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) {
  vdouble s = vrec_vd_vd(d);
  return vd2setxy_vd2_vd_vd(s, vmul_vd_vd_vd(s, vfmanp_vd_vd_vd_vd(d, s, vcast_vd_d(1))));
}

static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) {
  vdouble s = vrec_vd_vd(vd2getx_vd_vd2(d));
  return vd2setxy_vd2_vd_vd(s, vmul_vd_vd_vd(s, vfmanp_vd_vd_vd_vd(vd2gety_vd_vd2(d), s,
                                                                   vfmanp_vd_vd_vd_vd(vd2getx_vd_vd2(d), s, vcast_vd_d(1)))));
}
#else
// Non-FMA implementations: each operand is split via vupper_vd_vd into high
// and low halves whose partial products are exact (Dekker multiplication).

static INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {
  vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
  vdouble dh = vupper_vd_vd(vd2getx_vd_vd2(d)), dl = vsub_vd_vd_vd(vd2getx_vd_vd2(d), dh);
  vdouble th = vupper_vd_vd(t), tl = vsub_vd_vd_vd(t, th);
  vdouble nhh = vupper_vd_vd(vd2getx_vd_vd2(n)), nhl = vsub_vd_vd_vd(vd2getx_vd_vd2(n), nhh);

  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(n), t);
  vdouble u = vadd_vd_5vd(vsub_vd_vd_vd(vmul_vd_vd_vd(nhh, th), s), vmul_vd_vd_vd(nhh, tl),
                          vmul_vd_vd_vd(nhl, th), vmul_vd_vd_vd(nhl, tl),
                          vmul_vd_vd_vd(s, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th),
                                                       vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th),
                                                       vmul_vd_vd_vd(dl, tl))));
  return vd2setxy_vd2_vd_vd(s, vmla_vd_vd_vd_vd(t, vsub_vd_vd_vd(vd2gety_vd_vd2(n),
                                                                 vmul_vd_vd_vd(s, vd2gety_vd_vd2(d))), u));
}

static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble xh = vupper_vd_vd(x), xl = vsub_vd_vd_vd(x, xh);
  vdouble yh = vupper_vd_vd(y), yl = vsub_vd_vd_vd(y, yh);
  vdouble s = vmul_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_5vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh),
                                           vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl)));
}

static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
  vdouble yh = vupper_vd_vd(y), yl = vsub_vd_vd_vd(y, yh);
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_6vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh),
                                           vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl),
                                           vmul_vd_vd_vd(vd2gety_vd_vd2(x), y)));
}

static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
  vdouble yh = vupper_vd_vd(vd2getx_vd_vd2(y)), yl = vsub_vd_vd_vd(vd2getx_vd_vd2(y), yh);
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_7vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh),
                                           vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl),
                                           vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y)),
                                           vmul_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y))));
}

// Product collapsed to a single vdouble; terms summed smallest-first.
static INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
  vdouble yh = vupper_vd_vd(vd2getx_vd_vd2(y)), yl = vsub_vd_vd_vd(vd2getx_vd_vd2(y), yh);
  return vadd_vd_6vd(vmul_vd_vd_vd(vd2gety_vd_vd2(x), yh), vmul_vd_vd_vd(xh, vd2gety_vd_vd2(y)),
                     vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yh),
                     vmul_vd_vd_vd(xh, yh));
}

static INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_5vd(vmul_vd_vd_vd(xh, xh), vneg_vd_vd(s),
                                           vmul_vd_vd_vd(vadd_vd_vd_vd(xh, xh), xl), vmul_vd_vd_vd(xl, xl),
                                           vmul_vd_vd_vd(vd2getx_vd_vd2(x),
                                                         vadd_vd_vd_vd(vd2gety_vd_vd2(x), vd2gety_vd_vd2(x)))));
}

static INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
  return vadd_vd_5vd(vmul_vd_vd_vd(xh, vd2gety_vd_vd2(x)), vmul_vd_vd_vd(xh, vd2gety_vd_vd2(x)),
                     vmul_vd_vd_vd(xl, xl),
                     vadd_vd_vd_vd(vmul_vd_vd_vd(xh, xl), vmul_vd_vd_vd(xh, xl)),
                     vmul_vd_vd_vd(xh, xh));
}

static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) {
  vdouble t = vrec_vd_vd(d);
  vdouble dh = vupper_vd_vd(d), dl = vsub_vd_vd_vd(d, dh);
  vdouble th = vupper_vd_vd(t), tl = vsub_vd_vd_vd(t, th);
  return vd2setxy_vd2_vd_vd(t, vmul_vd_vd_vd(t, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th),
                                                            vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th),
                                                            vmul_vd_vd_vd(dl, tl))));
}

static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) {
  vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
  vdouble dh = vupper_vd_vd(vd2getx_vd_vd2(d)), dl = vsub_vd_vd_vd(vd2getx_vd_vd2(d), dh);
  vdouble th = vupper_vd_vd(t), tl = vsub_vd_vd_vd(t, th);
  return vd2setxy_vd2_vd_vd(t, vmul_vd_vd_vd(t, vsub_vd_6vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th),
                                                            vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th),
                                                            vmul_vd_vd_vd(dl, tl),
                                                            vmul_vd_vd_vd(vd2gety_vd_vd2(d), t))));
}
#endif

// sqrt(d) refined from a hardware sqrt estimate of hi + lo:
// sqrt(d) ~= (d + t*t) / t * 0.5 with t = sqrt(hi + lo).
static INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd2(vdouble2 d) {
  vdouble t = vsqrt_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)));
  return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd2(d, ddmul_vd2_vd_vd(t, t)),
                                              ddrec_vd2_vd(t)), vcast_vd_d(0.5));
}

static INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd(vdouble d) {
  vdouble t = vsqrt_vd_vd(d);
  return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd_vd2(d, ddmul_vd2_vd_vd(t, t)),
                                              ddrec_vd2_vd(t)), vcast_vd_d(0.5));
}


================================================
FILE: src/df.h
================================================
// Copyright Naoki Shibata and contributors 2010 - 2020.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

// Float-float ("df") arithmetic: single-precision counterpart of dd.h, with
// a value represented as the unevaluated sum x + y of two vfloat lanes.

// On SVE targets the two-vector struct and its accessors are provided
// elsewhere; define them here for every other target.
#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA))
typedef struct {
  vfloat x, y;
} vfloat2;

static vfloat vf2getx_vf_vf2(vfloat2 v) { return v.x; }
static vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; }
static vfloat2 vf2setxy_vf2_vf_vf(vfloat x, vfloat y) { vfloat2 v; v.x = x; v.y = y; return v; }
static vfloat2 vf2setx_vf2_vf2_vf(vfloat2 v, vfloat d) { v.x = d; return v; }
static vfloat2 vf2sety_vf2_vf2_vf(vfloat2 v, vfloat d) { v.y = d; return v; }
#endif

// Clear the low 12 mantissa bits so products of two "upper" halves are exact
// in single precision (Dekker/Veltkamp splitting, float edition).
static INLINE CONST VECTOR_CC vfloat vupper_vf_vf(vfloat d) {
  return vreinterpret_vf_vi2(vand_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vcast_vi2_i(0xfffff000)));
}

static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_vf_vf(vfloat h, vfloat l) {
  return vf2setxy_vf2_vf_vf(h, l);
}

static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_f_f(float h, float l) {
  return vf2setxy_vf2_vf_vf(vcast_vf_f(h), vcast_vf_f(l));
}

// Split a double constant into float hi/lo parts: hi = round(d),
// lo = the part of d lost by that rounding.
static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_d(double d) {
  return vf2setxy_vf2_vf_vf(vcast_vf_f(d), vcast_vf_f(d - (float)d));
}

static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vf2_vf2(vopmask m, vfloat2 x, vfloat2 y) {
  return vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(m, vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)),
                            vsel_vf_vo_vf_vf(m, vf2gety_vf_vf2(x), vf2gety_vf_vf2(y)));
}

static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_f_f_f_f(vopmask o, float x1, float y1, float
x0, float y0) {
  return vf2setxy_vf2_vf_vf(vsel_vf_vo_f_f(o, x1, x0), vsel_vf_vo_f_f(o, y1, y0));
}

// Two- and three-way selects between double constants pre-split into
// float-float pairs.
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0),
                             vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1), vcast_vf2_d(d2)));
}

static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0),
                             vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1),
                                                 vsel_vf2_vo_vf2_vf2(o2, vcast_vf2_d(d2), vcast_vf2_d(d3))));
}

// Absolute value: the sign of hi is stripped from both components by xoring
// with hi's extracted sign bit.
static INLINE CONST VECTOR_CC vfloat2 vabs_vf2_vf2(vfloat2 x) {
  return vcast_vf2_vf_vf(vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)),
                                                                        vreinterpret_vm_vf(vf2getx_vf_vf2(x))),
                                                          vreinterpret_vm_vf(vf2getx_vf_vf2(x)))),
                         vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)),
                                                                        vreinterpret_vm_vf(vf2getx_vf_vf2(x))),
                                                          vreinterpret_vm_vf(vf2gety_vf_vf2(x)))));
}

// Left-to-right sums/differences of 3..7 vectors; evaluation order matters
// for the compensated-arithmetic callers, so do not reassociate.
static INLINE CONST VECTOR_CC vfloat vadd_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {
  return vadd_vf_vf_vf(vadd_vf_vf_vf(v0, v1), v2);
}

static INLINE CONST VECTOR_CC vfloat vadd_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {
  return vadd_vf_3vf(vadd_vf_vf_vf(v0, v1), v2, v3);
}

static INLINE CONST VECTOR_CC vfloat vadd_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) {
  return vadd_vf_4vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4);
}

static INLINE CONST VECTOR_CC vfloat vadd_vf_6vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5) {
  return vadd_vf_5vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5);
}

static INLINE CONST VECTOR_CC vfloat vadd_vf_7vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5, vfloat v6) {
  return vadd_vf_6vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5, v6);
}

static INLINE CONST VECTOR_CC vfloat vsub_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {
  return vsub_vf_vf_vf(vsub_vf_vf_vf(v0, v1), v2);
}

static INLINE CONST VECTOR_CC vfloat vsub_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {
  return vsub_vf_3vf(vsub_vf_vf_vf(v0, v1), v2, v3);
}

static INLINE CONST VECTOR_CC vfloat vsub_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) {
  return vsub_vf_4vf(vsub_vf_vf_vf(v0, v1), v2, v3, v4);
}

//

// Negate both components.
static INLINE CONST VECTOR_CC vfloat2 dfneg_vf2_vf2(vfloat2 x) {
  return vcast_vf2_vf_vf(vneg_vf_vf(vf2getx_vf_vf2(x)), vneg_vf_vf(vf2gety_vf_vf2(x)));
}

// Absolute value: |hi|, and lo's sign flipped iff hi was negative.
static INLINE CONST VECTOR_CC vfloat2 dfabs_vf2_vf2(vfloat2 x) {
  return vcast_vf2_vf_vf(vabs_vf_vf(vf2getx_vf_vf2(x)),
                         vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2gety_vf_vf2(x)),
                                                          vand_vm_vm_vm(vreinterpret_vm_vf(vf2getx_vf_vf2(x)),
                                                                        vreinterpret_vm_vf(vcast_vf_f(-0.0f))))));
}

// Renormalize so hi carries as much of the value as possible (Fast2Sum).
static INLINE CONST VECTOR_CC vfloat2 dfnormalize_vf2_vf2(vfloat2 t) {
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(t), vf2gety_vf_vf2(t));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(t), s), vf2gety_vf_vf2(t)));
}

// Scale both components; exact when s is a power of two.
static INLINE CONST VECTOR_CC vfloat2 dfscale_vf2_vf2_vf(vfloat2 d, vfloat s) {
  return vf2setxy_vf2_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(d), s), vmul_vf_vf_vf(vf2gety_vf_vf2(d), s));
}

// Fast2Sum: requires |x| >= |y|; lo is the exact rounding error.
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf_vf(vfloat x, vfloat y) {
  vfloat s = vadd_vf_vf_vf(x, y);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(x, s), y));
}

// 2Sum (Knuth): no magnitude ordering required.
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf(vfloat x, vfloat y) {
  vfloat s = vadd_vf_vf_vf(x, y);
  vfloat v = vsub_vf_vf_vf(s, x);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(y, v)));
}

// float + ff, 2Sum flavor.
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf2(vfloat x, vfloat2 y) {
  vfloat s = vadd_vf_vf_vf(x, vf2getx_vf_vf2(y));
  vfloat v = vsub_vf_vf_vf(s, x);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(s, v)),
                                                           vsub_vf_vf_vf(vf2getx_vf_vf2(y), v)),
                                             vf2gety_vf_vf2(y)));
}

// ff + float, Fast2Sum flavor (assumes |x| >= |y|).
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), y);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_3vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), y, vf2gety_vf_vf2(x)));
}

// ff - float, Fast2Sum flavor.
static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat s = vsub_vf_vf_vf(vf2getx_vf_vf2(x), y);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), y), vf2gety_vf_vf2(x)));
}

// ff + float, 2Sum flavor.
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), y);
  vfloat v = vsub_vf_vf_vf(s, vf2getx_vf_vf2(x));
  vfloat t = vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(y, v));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(t, vf2gety_vf_vf2(x)));
}

// float + ff, Fast2Sum flavor.
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf_vf2(vfloat x, vfloat2 y) {
  vfloat s = vadd_vf_vf_vf(x, vf2getx_vf_vf2(y));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_3vf(vsub_vf_vf_vf(x, s), vf2getx_vf_vf2(y), vf2gety_vf_vf2(y)));
}

static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  // |x| >= |y|
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_4vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), vf2getx_vf_vf2(y),
                                           vf2gety_vf_vf2(x), vf2gety_vf_vf2(y)));
}

// ff + ff, 2Sum flavor (no ordering assumption).
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  vfloat v = vsub_vf_vf_vf(s, vf2getx_vf_vf2(x));
  vfloat t = vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), vsub_vf_vf_vf(s, v)),
                           vsub_vf_vf_vf(vf2getx_vf_vf2(y), v));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(t, vadd_vf_vf_vf(vf2gety_vf_vf2(x), vf2gety_vf_vf2(y))));
}

static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf_vf(vfloat x, vfloat y) {
  // |x| >= |y|
  vfloat s = vsub_vf_vf_vf(x, y);
  return vf2setxy_vf2_vf_vf(s, vsub_vf_vf_vf(vsub_vf_vf_vf(x, s), y));
}

static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  // |x| >= |y|
  vfloat s = vsub_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  vfloat t = vsub_vf_vf_vf(vf2getx_vf_vf2(x), s);
  t = vsub_vf_vf_vf(t, vf2getx_vf_vf2(y));
  t = vadd_vf_vf_vf(t, vf2gety_vf_vf2(x));
  return vf2setxy_vf2_vf_vf(s, vsub_vf_vf_vf(t, vf2gety_vf_vf2(y)));
}

#ifdef ENABLE_FMA_SP
// FMA implementations: vfmapn computes a*b - c and vfmanp computes c - a*b
// exactly-rounded, so the multiplication error terms come out directly.

// ff division via one Newton step on a reciprocal estimate.
static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {
  vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(n), t);
  vfloat u = vfmapn_vf_vf_vf_vf(t, vf2getx_vf_vf2(n), s);
  vfloat v = vfmanp_vf_vf_vf_vf(vf2gety_vf_vf2(d), t,
                                vfmanp_vf_vf_vf_vf(vf2getx_vf_vf2(d), t, vcast_vf_f(1)));
  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(s, v, vfma_vf_vf_vf_vf(vf2gety_vf_vf2(n), t, u)));
}

// Exact product of two floats (2Prod with FMA).
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {
  vfloat s = vmul_vf_vf_vf(x, y);
  return vf2setxy_vf2_vf_vf(s, vfmapn_vf_vf_vf_vf(x, y, s));
}

static INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) {
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x));
  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x)),
                                                vf2gety_vf_vf2(x),
                                                vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x), s)));
}

// Square collapsed to a single vfloat (drops the error term).
static INLINE CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) {
  return vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x),
                          vadd_vf_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x)),
                                        vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x))));
}

static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y),
                                                vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y),
                                                                 vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y), s))));
}

// Product collapsed to a single vfloat.
static INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {
  return vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y),
                          vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y),
                                           vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y))));
}

static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), y);
  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), y,
                                                vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), y, s)));
}

// Reciprocal refined with one FMA-based Newton correction.
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) {
  vfloat s = vrec_vf_vf(d);
  return vf2setxy_vf2_vf_vf(s, vmul_vf_vf_vf(s, vfmanp_vf_vf_vf_vf(d, s, vcast_vf_f(1))));
}

static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) {
  vfloat s = vrec_vf_vf(vf2getx_vf_vf2(d));
  return vf2setxy_vf2_vf_vf(s, vmul_vf_vf_vf(s, vfmanp_vf_vf_vf_vf(vf2gety_vf_vf2(d), s,
                                                                   vfmanp_vf_vf_vf_vf(vf2getx_vf_vf2(d), s, vcast_vf_f(1)))));
}
#else
// Non-FMA implementations: operands are split via vupper_vf_vf into high and
// low halves whose partial products are exact (Dekker multiplication), and
// the partial products are accumulated with vmla chains.

static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {
  vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));
  vfloat dh = vupper_vf_vf(vf2getx_vf_vf2(d)), dl = vsub_vf_vf_vf(vf2getx_vf_vf2(d), dh);
  vfloat th = vupper_vf_vf(t), tl = vsub_vf_vf_vf(t, th);
  vfloat nhh = vupper_vf_vf(vf2getx_vf_vf2(n)), nhl = vsub_vf_vf_vf(vf2getx_vf_vf2(n), nhh);

  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(n), t);

  vfloat u, w;
  // w accumulates d.x * t - 1 exactly from the split partial products.
  w = vcast_vf_f(-1);
  w = vmla_vf_vf_vf_vf(dh, th, w);
  w = vmla_vf_vf_vf_vf(dh, tl, w);
  w = vmla_vf_vf_vf_vf(dl, th, w);
  w = vmla_vf_vf_vf_vf(dl, tl, w);
  w = vneg_vf_vf(w);

  // u accumulates n.x * t - s plus the reciprocal correction term.
  u = vmla_vf_vf_vf_vf(nhh, th, vneg_vf_vf(s));
  u = vmla_vf_vf_vf_vf(nhh, tl, u);
  u = vmla_vf_vf_vf_vf(nhl, th, u);
  u = vmla_vf_vf_vf_vf(nhl, tl, u);
  u = vmla_vf_vf_vf_vf(s, w, u);

  return vf2setxy_vf2_vf_vf(s, vmla_vf_vf_vf_vf(t, vsub_vf_vf_vf(vf2gety_vf_vf2(n),
                                                                 vmul_vf_vf_vf(s, vf2gety_vf_vf2(d))), u));
}

static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {
  vfloat xh = vupper_vf_vf(x), xl = vsub_vf_vf_vf(x, xh);
  vfloat yh = vupper_vf_vf(y), yl = vsub_vf_vf_vf(y, yh);
  vfloat s = vmul_vf_vf_vf(x, y), t;

  t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
  t = vmla_vf_vf_vf_vf(xl, yh, t);
  t = vmla_vf_vf_vf_vf(xh, yl, t);
  t = vmla_vf_vf_vf_vf(xl, yl, t);

  return vf2setxy_vf2_vf_vf(s, t);
}

static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
  vfloat yh = vupper_vf_vf(y), yl = vsub_vf_vf_vf(y, yh);
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), y), t;

  t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
  t = vmla_vf_vf_vf_vf(xl, yh, t);
  t = vmla_vf_vf_vf_vf(xh, yl, t);
  t = vmla_vf_vf_vf_vf(xl, yl, t);
  t = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(x), y, t);

  return vf2setxy_vf2_vf_vf(s, t);
}

static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
  vfloat yh = vupper_vf_vf(vf2getx_vf_vf2(y)), yl = vsub_vf_vf_vf(vf2getx_vf_vf2(y), yh);
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)), t;

  t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
  t = vmla_vf_vf_vf_vf(xl, yh, t);
  t = vmla_vf_vf_vf_vf(xh, yl, t);
  t = vmla_vf_vf_vf_vf(xl, yl, t);
  t = vmla_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y), t);
  t = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), t);

  return vf2setxy_vf2_vf_vf(s, t);
}

// Product collapsed to a single vfloat; terms summed smallest-first.
static INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
  vfloat yh = vupper_vf_vf(vf2getx_vf_vf2(y)), yl = vsub_vf_vf_vf(vf2getx_vf_vf2(y), yh);
  return vadd_vf_6vf(vmul_vf_vf_vf(vf2gety_vf_vf2(x), yh), vmul_vf_vf_vf(xh, vf2gety_vf_vf2(y)),
                     vmul_vf_vf_vf(xl, yl), vmul_vf_vf_vf(xh, yl), vmul_vf_vf_vf(xl, yh),
                     vmul_vf_vf_vf(xh, yh));
}

static INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x)), t;

  t = vmla_vf_vf_vf_vf(xh, xh, vneg_vf_vf(s));
  t = vmla_vf_vf_vf_vf(vadd_vf_vf_vf(xh, xh), xl, t);
  t = vmla_vf_vf_vf_vf(xl, xl, t);
  t = vmla_vf_vf_vf_vf(vf2getx_vf_vf2(x), vadd_vf_vf_vf(vf2gety_vf_vf2(x), vf2gety_vf_vf2(x)), t);

  return vf2setxy_vf2_vf_vf(s, t);
}

static INLINE CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
  return vadd_vf_5vf(vmul_vf_vf_vf(xh, vf2gety_vf_vf2(x)), vmul_vf_vf_vf(xh, vf2gety_vf_vf2(x)),
                     vmul_vf_vf_vf(xl, xl),
                     vadd_vf_vf_vf(vmul_vf_vf_vf(xh, xl), vmul_vf_vf_vf(xh, xl)),
                     vmul_vf_vf_vf(xh, xh));
}

static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) {
  vfloat t = vrec_vf_vf(d);
  vfloat dh = vupper_vf_vf(d), dl = vsub_vf_vf_vf(d, dh);
  vfloat th = vupper_vf_vf(t), tl = vsub_vf_vf_vf(t, th);

  // u accumulates d * t - 1 exactly from the split partial products.
  vfloat u = vcast_vf_f(-1);
  u = vmla_vf_vf_vf_vf(dh, th, u);
  u = vmla_vf_vf_vf_vf(dh, tl, u);
  u = vmla_vf_vf_vf_vf(dl, th, u);
  u = vmla_vf_vf_vf_vf(dl, tl, u);

  return vf2setxy_vf2_vf_vf(t, vmul_vf_vf_vf(vneg_vf_vf(t), u));
}

static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) {
  vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));
  vfloat dh = vupper_vf_vf(vf2getx_vf_vf2(d)), dl = vsub_vf_vf_vf(vf2getx_vf_vf2(d), dh);
  vfloat th = vupper_vf_vf(t), tl = vsub_vf_vf_vf(t, th);

  vfloat u = vcast_vf_f(-1);
  u = vmla_vf_vf_vf_vf(dh, th, u);
  u = vmla_vf_vf_vf_vf(dh, tl, u);
  u = vmla_vf_vf_vf_vf(dl, th, u);
  u = vmla_vf_vf_vf_vf(dl, tl, u);
  u = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(d), t, u);

  return vf2setxy_vf2_vf_vf(t, vmul_vf_vf_vf(vneg_vf_vf(t), u));
}
#endif

// sqrt(d) refined from a hardware estimate; the ENABLE_RECSQRT_SP path uses
// a reciprocal-square-root estimate instead of a plain sqrt.
static INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf2(vfloat2 d) {
#ifdef ENABLE_RECSQRT_SP
  vfloat x = vrecsqrt_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)));
  vfloat2 r = dfmul_vf2_vf2_vf(d, x);
  return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(r, dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(r, x),
                                                                   vcast_vf_f(-3.0))),
                            vcast_vf_f(-0.5));
#else
  vfloat t = vsqrt_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)));
  return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf2(d, dfmul_vf2_vf_vf(t, t)),
                                              dfrec_vf2_vf(t)),
                            vcast_vf_f(0.5));
#endif
}

static
INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf(vfloat d) {
  /* Double-float sqrt of a single vfloat: Heron step
   * 0.5 * (d + t^2) / t around the hardware estimate t. */
  vfloat t = vsqrt_vf_vf(d);
  return dfscale_vf2_vf2_vf(
      dfmul_vf2_vf2_vf2(dfadd2_vf2_vf_vf2(d, dfmul_vf2_vf_vf(t, t)),
                        dfrec_vf2_vf(t)),
      vcast_vf_f(0.5f));
}

================================================ FILE: src/estrin.h ================================================

// Copyright Naoki Shibata and contributors 2010 - 2020.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

// These are macros for evaluating polynomials using Estrin's method
//
// POLYn evaluates a degree-(n-1) polynomial with coefficients c0..c(n-1)
// (highest degree first in the argument list).  Callers must pass the
// precomputed powers x2 = x*x, x4 = x2*x2, x8 = x4*x4, x16 = x8*x8; the
// tree shape lets independent MLA sub-expressions run in parallel.

#define POLY2(x, c1, c0) MLA(x, C2V(c1), C2V(c0))
#define POLY3(x, x2, c2, c1, c0) MLA(x2, C2V(c2), MLA(x, C2V(c1), C2V(c0)))
#define POLY4(x, x2, c3, c2, c1, c0) MLA(x2, MLA(x, C2V(c3), C2V(c2)), MLA(x, C2V(c1), C2V(c0)))
#define POLY5(x, x2, x4, c4, c3, c2, c1, c0) MLA(x4, C2V(c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY6(x, x2, x4, c5, c4, c3, c2, c1, c0) MLA(x4, POLY2(x, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY7(x, x2, x4, c6, c5, c4, c3, c2, c1, c0) MLA(x4, POLY3(x, x2, c6, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0) MLA(x4, POLY4(x, x2, c7, c6, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY9(x, x2, x4, x8, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, C2V(c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY10(x, x2, x4, x8, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY2(x, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY11(x, x2, x4, x8, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY3(x, x2, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY12(x, x2, x4, x8, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY4(x, x2, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY13(x, x2, x4, x8, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY5(x, x2, x4, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY14(x, x2, x4, x8, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY6(x, x2, x4, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY15(x, x2, x4, x8, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY7(x, x2, x4, ce, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY8(x, x2, x4, cf, ce, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY17(x, x2, x4, x8, x16, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x16, C2V(d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY18(x, x2, x4, x8, x16, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x16, POLY2(x, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY19(x, x2, x4, x8, x16, d2, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x16, POLY3(x, x2, d2, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))

================================================ FILE: src/fp16.cpp ================================================

/* Copyright (c) 2021 Agenium Scale

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial
portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* We follow IEEE754-2008 for FP16 (= binary16) storage. However IEEE754 compliance is not guaranteed by C/C++ standards and therefore we propose two modes: - IEEE754 mode with NaNs, INFs, ... (this is the default) - non IEEE754 mode compatible with only C89 (no NaNs, INFs...) FP16 format ----------- +---+--------+--------------+ | S | E EEEE | MM MMMM MMMM | +---+--------+--------------+ 15 14 10 9 0 FP16 interpretation ------------------- S = sign bit E = exponent bits (offset is 15), emin = -14, emax = 15 M = mantissa bits E == 0 and M != 0 => subnormal => (-1)^S x 2^(-14) x (0 + 2^(-10) x T) 32 > E > 0 => normal => (-1)^S x 2^(E - 15) x (1 + 2^(-10) x T) FP32 format ----------- +---+-----------+------------------------------+ | S | EEEE EEEE | MMM MMMM MMMM MMMM MMMM MMMM | +---+-----------+------------------------------+ 31 30 23 22 0 FP32 interpretation ------------------- S = sign bit E = exponent bits (offset is 127), emin = -126, emax = 127 M = mantissa bits E == 0 and M != 0 => subnormal => (-1)^S x 2^(-126) x (0 + 2^(-23) x T) 256 > E > 0 => normal => (-1)^S x 2^(E - 127) x (1 + 2^(-23) x T) In both cases we treat subnormal numbers as zeros. Moreover the implementation below was written so that it can easily be SIMD'ed. 
*/

#define NSIMD_INSIDE
#include
#ifdef NSIMD_NO_IEEE754
#include
#endif
#include
/* NOTE(review): the three #include directives above lost their header names
 * during extraction -- restore them from the repository before building. */

#ifdef NSIMD_C_LINKAGE_FOR_F16
extern "C" {
#endif

// ----------------------------------------------------------------------------
// Convert a FP16 as an u16 to a float

NSIMD_DLLEXPORT float nsimd_u16_to_f32(u16 a) {
#ifdef NSIMD_NO_IEEE754
  /* C89-portable path: decode the fields arithmetically, no bit punning. */
  float sign;
  int exponent, mantissa;
  sign = (a >> 15) == 1 ? -1.0f : 1.0f;
  exponent = (a >> 10) & 0x1F;
  /* NOTE(review): the (float) cast is immediately truncated back to int
   * (mantissa is declared int); harmless since the value fits, but the
   * cast looks unintentional -- confirm. */
  mantissa = (float)(a & 0x3FF);
  if (exponent == 0) {
    /* Subnormal: value = sign * mantissa * 2^-24. */
    return std::ldexp(sign * mantissa, -24);
  } else {
    /* Normal: 0x400 restores the implicit leading mantissa bit; the
     * -25 = -15 (bias) - 10 (mantissa width) is folded into the exponent. */
    return std::ldexp(sign * (0x400 | mantissa), exponent - 25);
  }
#else
  /* IEEE754 path: widen the three FP16 fields into FP32 positions. */
  u32 sign, mantissa, exponent;
  sign = a & 0x8000;
  exponent = (a >> 10) & 0x1F;
  mantissa = (a & 0x3FF);
  if (exponent == 31) {
    /* We have a NaN or an INF. */
    exponent = 255;
    /* Force the first bit of the mantissa to 1 to be compatible with the way
     * Intel convert f16 to f32 */
    if (mantissa != 0) {
      //mantissa |= 0x200;
    }
  } else if (exponent == 0 && mantissa == 0) {
    /* Nothing to do */
  } else if (exponent == 0) {
    /* FP16 subnormal: normalize so it becomes a FP32 normal. */
    u32 mask = mantissa;
    /* Find the most significant bit of the mantissa (could use a better
     * algorithm) */
    int i = -1;
    do {
      ++i;
      mask <<= 1;
    } while ((mask & 0x400) == 0);
    /* Update the mantissa and the exponent */
    mantissa = (mask & 0x3ff);
    exponent += (u32)(112 - i);
  } else {
    /* the exponent must be recomputed -15 + 127 */
    exponent += 112;
  }
  /* We then rebuild the float */
  return nsimd_scalar_reinterpret_f32_u32(
      (sign << 16) | (((u32)exponent) << 23) | (mantissa << 13));
#endif
}

// ----------------------------------------------------------------------------
// Convert a FP16 to a float

#ifndef NSIMD_NATIVE_FP16
NSIMD_DLLEXPORT f32 nsimd_f16_to_f32(f16 a) { return nsimd_u16_to_f32(a.u); }
#endif

// ----------------------------------------------------------------------------
// Convert a float to a FP16 as an u16

NSIMD_DLLEXPORT u16 nsimd_f32_to_u16(f32 a) {
#ifdef NSIMD_NO_IEEE754
  /* C89-portable path built on frexp; truncates (no round-to-nearest). */
  double frac;
  int exponent;
  u32 sign, mantissa;
  /* Get mantissa (= fractional part) and exponent. */
  frac = std::frexp(a, &exponent);
  /* Get sign and make sure frac is positive. */
  if (frac < 0) {
    sign = 1u;
    frac = -frac;
  } else {
    sign = 0u;
  }
  /* Add 1 to the exponent to have the IEEE exponent: The mantissa here
     lives in [0.5, 1) whereas for IEEE it must live in [1, 2). */
  exponent++;
  if (exponent < -14) {
    /* We have a too small number, returns zero */
    return (u16)(sign << 15);
  } else if (exponent > 15) {
    /* We have a too big number, return INF */
    return (u16)((sign << 15) | 0x7C00);
  } else {
    /* We have a normal number. Get the mantissa: frac lives in [0.5, 1) and
       is of the form 0.1XXXXXXX, therefore to get the mantissa frac must be
       multiplied by 2^11 = 2048. Then it will be of the form
       1XX XXXX XXXX.XXXXX, so we have to get rid of the leading bit. */
    mantissa = (u32)(frac * 2048.0) & 0x3FF;
    return (u16)((sign << 15) | ((u32)(exponent + 15) << 10) | mantissa);
  }
#else
  /* IEEE754 path: bit-level conversion with round-to-nearest-even. */
  u32 sign, mantissa;
  int exponent;
  u32 in_u = nsimd_scalar_reinterpret_u32_f32(a);
  sign = in_u & 0x80000000;
  exponent = (int)((in_u >> 23) & 0xFF);
  mantissa = (in_u & 0x7FFFFF);
  if (exponent == 255 && mantissa != 0) {
    /* Nan */
    return (u16)(0xffff);
  }
  /* 0x477ff000 is the first f32 that rounds above the largest finite f16. */
  const f32 biggest_f16 = nsimd_scalar_reinterpret_f32_u32(0x477ff000);
  if (a >= biggest_f16 || a <= -biggest_f16) {
    /* Number is too big to be representable in half => return infinity */
    return (u16)(sign >> 16 | 0x1f << 10);
  }
  const f32 smallest_f16 = nsimd_scalar_reinterpret_f32_u32(0x33000000);
  if (a <= smallest_f16 && a >= -smallest_f16) {
    /* Number is too small to be representable in half => return ±0 */
    return (u16)(sign >> 16);
  }
  /* For FP32 exponent bias is 127, compute the real exponent. */
  exponent -= 127;
  /* Following algorithm taken from:
   * https://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ */
  const f32 denormal_f16 = nsimd_scalar_reinterpret_f32_u32(0x38800000);
  if (a < denormal_f16 && a > -denormal_f16) {
    /* Denormalized half: add a "magic" float so the FPU performs the
     * shift-and-round, then strip the magic back off. */
    const u32 magic_u = ((127 - 15) + (23 - 10) + 1) << 23;
    const f32 magic_f = nsimd_scalar_reinterpret_f32_u32(magic_u);
    /* NOTE(review): this declaration shadows the outer in_u on purpose:
     * the sign bit is cleared locally and re-applied on return. */
    u32 in_u = nsimd_scalar_reinterpret_u32_f32(a);
    in_u &= ~0x80000000u;
    f32 in_f = nsimd_scalar_reinterpret_f32_u32(in_u);
    in_f += magic_f;
    in_u = nsimd_scalar_reinterpret_u32_f32(in_f);
    in_u -= magic_u;
    return (u16)((sign >> 16) | in_u);
  }
  /* Normal half: rebias the exponent and round to nearest even
   * (0xfff + lowest kept mantissa bit). */
  in_u &= ~0x80000000U;
  u32 mant_odd = (in_u >> 13) & 1;
  in_u += ((u32)(15 - 127) << 23) + 0xfffU;
  in_u += mant_odd;
  return (u16)((sign >> 16) | (in_u >> 13));
#endif
}

// ----------------------------------------------------------------------------
// Convert a float to a FP16

#ifndef NSIMD_NATIVE_FP16
NSIMD_DLLEXPORT f16 nsimd_f32_to_f16(f32 a) {
  f16 ret;
  ret.u = nsimd_f32_to_u16(a);
  return ret;
}
#endif

// ----------------------------------------------------------------------------

#ifdef NSIMD_C_LINKAGE_FOR_F16
} // extern "C"
#endif

// ----------------------------------------------------------------------------
// C++ versions in namespace nsimd

namespace nsimd {

NSIMD_DLLEXPORT u16 f32_to_u16(f32 a) { return nsimd_f32_to_u16(a); }
NSIMD_DLLEXPORT f32 u16_to_f32(u16 a) { return nsimd_u16_to_f32(a); }

#ifndef NSIMD_NATIVE_FP16
NSIMD_DLLEXPORT f16 f32_to_f16(f32 a) { return nsimd_f32_to_f16(a); }
NSIMD_DLLEXPORT f32 f16_to_f32(f16 a) { return nsimd_f16_to_f32(a); }
#endif

} // namespace nsimd

================================================ FILE: src/gpu.cpp ================================================

/* Copyright (c) 2021 Agenium Scale

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to permit
persons to whom the Software is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.

*/

#define NSIMD_INSIDE
#include
/* NOTE(review): the #include above lost its header name during extraction. */

#if defined(NSIMD_ONEAPI) && NSIMD_CXX > 0

// ----------------------------------------------------------------------------
// oneAPI

// NSIMD error handler
namespace nsimd {
namespace oneapi {
/* Async-exception handler installed on the default SYCL queue: prints the
 * exception and aborts.
 * NOTE(review): the template parameter list (presumably
 * "template <typename Exception = ...>", which supplies the Exception type
 * used in the catch below and the <> at the use site) was stripped during
 * extraction -- restore it from the repository. */
template struct sycl_async_error_handler {
  void operator()(const sycl::exception_list &elist) {
    for (const auto &exc : elist) {
      try {
        std::rethrow_exception(exc);
      } catch (const Exception &exc) {
        fprintf(stderr, "NSIMD Internal error:\n\tError: %s %s %d\n",
                exc.what(), __FILE__, __LINE__);
        exit(EXIT_FAILURE);
      }
    }
  }
};
} // namespace oneapi
} // namespace nsimd

extern "C" {

// Singleton to get default oneAPI queue
NSIMD_DLLSPEC void *nsimd_oneapi_default_queue() {
  /* Function-local static: constructed once, lives for the whole process. */
  static sycl::queue ret(sycl::default_selector{},
                         nsimd::oneapi::sycl_async_error_handler<>{});
  return (void *)&ret;
}

/* oneAPI: total number of work-items, i.e. nb_items rounded up to a
 * multiple of block_size. */
NSIMD_DLLSPEC nsimd_nat nsimd_kernel_param(nsimd_nat nb_items,
                                           nsimd_nat block_size) {
  return block_size * ((nb_items + block_size - 1) / block_size);
}

} // extern "C"

#elif defined(NSIMD_CUDA) || defined(NSIMD_ROCM)

// ----------------------------------------------------------------------------
// CUDA/ROCm

/* CUDA/ROCm: number of blocks needed to cover nb_items (round up). */
NSIMD_DLLSPEC nsimd_nat nsimd_kernel_param(nsimd_nat nb_items,
                                           nsimd_nat block_size) {
  return (nb_items + block_size - 1) / block_size;
}

#else

// ----------------------------------------------------------------------------
// CPU/SIMD

/* CPU/SIMD: number of full blocks (truncating division -- a possible
 * remainder is presumably handled by a scalar tail loop elsewhere). */
NSIMD_DLLSPEC nsimd_nat nsimd_kernel_param(nsimd_nat nb_items,
                                           nsimd_nat block_size) {
  return nb_items / block_size;
}

// ----------------------------------------------------------------------------

#endif

================================================ FILE: src/helperadvsimd.h ================================================

/*********************************************************************/
/* Copyright ARM Ltd. 2010 - 2019. */
/* Distributed under the Boost Software License, Version 1.0. */
/* (See accompanying file LICENSE.txt or copy at */
/* http://www.boost.org/LICENSE_1_0.txt) */
/*********************************************************************/

#ifndef __ARM_NEON
#error Please specify advsimd flags.
#endif

#if !defined(SLEEF_GENHEADER)
#include
#include
/* NOTE(review): the two #include directives above lost their header names
 * during extraction (presumably <arm_neon.h> and <stdint.h> -- confirm). */
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)

/* Configuration for this ISA.  The //@ lines are echo markers consumed by
 * SLEEF's header generator -- keep them in sync with the real #defines. */
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 1
//@#define LOG2VECTLENDP 1
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)

#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP 2
//@#define LOG2VECTLENSP 2
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)

#if CONFIG == 1
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#endif

#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT

#define ISANAME "AArch64 AdvSIMD"

// Mask definition
typedef uint32x4_t vmask;
typedef uint32x4_t vopmask;

// Single precision definitions
typedef float32x4_t vfloat;
typedef int32x4_t vint2;

// Double precision definitions
typedef float64x2_t vdouble;
typedef int32x2_t vint;

typedef struct {
  vmask x, y;
} vmask2;

#define DFTPRIORITY 10

/* Availability probe: AdvSIMD is baseline on AArch64, always available. */
static INLINE int vavailability_i(int name) {
return 3; }

/* No-op: AArch64 hardware prefetchers make an explicit hint unnecessary. */
static INLINE void vprefetch_v_p(const void *ptr) {}

/* Nonzero iff every 32-bit lane of the mask is all-ones: AND the two
 * halves, then pairwise-min, then read lane 0. */
static INLINE VECTOR_CC int vtestallones_i_vo32(vopmask g) {
  uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
  uint32x2_t x1 = vpmin_u32(x0, x0);
  return vget_lane_u32(x1, 0);
}

/* Same reduction as vo32; masks are 32-bit lanes either way here. */
static INLINE VECTOR_CC int vtestallones_i_vo64(vopmask g) {
  uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
  uint32x2_t x1 = vpmin_u32(x0, x0);
  return vget_lane_u32(x1, 0);
}

// Vector load / store
// (aligned and unaligned variants map to the same AdvSIMD instruction)
static INLINE VECTOR_CC vdouble vload_vd_p(const double *ptr) { return vld1q_f64(ptr); }
static INLINE VECTOR_CC vdouble vloadu_vd_p(const double *ptr) { return vld1q_f64(ptr); }
static INLINE VECTOR_CC void vstore_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }
static INLINE VECTOR_CC void vstoreu_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }

static INLINE VECTOR_CC vfloat vload_vf_p(const float *ptr) { return vld1q_f32(ptr); }
static INLINE VECTOR_CC vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
static INLINE VECTOR_CC void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
static INLINE VECTOR_CC void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }

static INLINE VECTOR_CC vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
static INLINE VECTOR_CC void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
static INLINE VECTOR_CC vint vloadu_vi_p(int32_t *p) { return vld1_s32(p); }
static INLINE VECTOR_CC void vstoreu_v_p_vi(int32_t *p, vint v) { vst1_s32(p, v); }

/* Gather: no hardware gather on AdvSIMD, emulated with scalar lane loads. */
static INLINE VECTOR_CC vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
  return ((vdouble){ptr[vget_lane_s32(vi, 0)], ptr[vget_lane_s32(vi, 1)]});
}

static INLINE VECTOR_CC vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
  return ((vfloat){ptr[vgetq_lane_s32(vi2, 0)], ptr[vgetq_lane_s32(vi2, 1)],
                   ptr[vgetq_lane_s32(vi2, 2)], ptr[vgetq_lane_s32(vi2, 3)]});
}

// Basic logical operations for mask
static INLINE VECTOR_CC vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
/* andnot(x, y) = y & ~x -- note the swapped operands of vbicq. */
static INLINE VECTOR_CC vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vbicq_u32(y, x); }
static INLINE VECTOR_CC vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); }
static INLINE VECTOR_CC vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); }

// Mask <--> single precision reinterpret
// (bit-pattern casts only; no value conversion)
static INLINE VECTOR_CC vmask vreinterpret_vm_vf(vfloat vf) { return vreinterpretq_u32_f32(vf); }
static INLINE VECTOR_CC vfloat vreinterpret_vf_vm(vmask vm) { return vreinterpretq_f32_u32(vm); }
static INLINE VECTOR_CC vint2 vcast_vi2_vm(vmask vm) { return vreinterpretq_s32_u32(vm); }
static INLINE VECTOR_CC vmask vcast_vm_vi2(vint2 vi) { return vreinterpretq_u32_s32(vi); }

// Mask <--> double precision reinterpret
static INLINE VECTOR_CC vmask vreinterpret_vm_vd(vdouble vd) { return vreinterpretq_u32_f64(vd); }
static INLINE VECTOR_CC vdouble vreinterpret_vd_vm(vmask vm) { return vreinterpretq_f64_u32(vm); }
static INLINE VECTOR_CC vfloat vreinterpret_vf_vi2(vint2 vm) { return vreinterpretq_f32_s32(vm); }
static INLINE VECTOR_CC vint2 vreinterpret_vi2_vf(vfloat vf) { return vreinterpretq_s32_f32(vf); }
static INLINE VECTOR_CC vint2 vreinterpret_vi2_vd(vdouble vd) { return vreinterpretq_s32_f64(vd); }

/****************************************/
/* Single precision FP operations */
/****************************************/
// Broadcast
static INLINE VECTOR_CC vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }

// Add, Sub, Mul
static INLINE VECTOR_CC vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return vaddq_f32(x, y); }
static INLINE VECTOR_CC vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return vsubq_f32(x, y); }
static INLINE VECTOR_CC vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return vmulq_f32(x, y); }

// |x|, -x
static INLINE VECTOR_CC vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }
static INLINE VECTOR_CC vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); }

#if CONFIG == 1
// Multiply accumulate: z = z + x * y
static INLINE
VECTOR_CC vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmaq_f32(z, x, y); } // Multiply subtract: z = z - x * y static INLINE VECTOR_CC vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmsq_f32(z, x, y); } // Multiply subtract: z = x * y - z static INLINE VECTOR_CC vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vfmsq_f32(z, x, y)); } #else static INLINE VECTOR_CC vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } static INLINE VECTOR_CC vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } static INLINE VECTOR_CC vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } #endif static INLINE VECTOR_CC vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z + x * y return vfmaq_f32(z, x, y); } static INLINE VECTOR_CC vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z - x * y return vfmsq_f32(z, x, y); } static INLINE VECTOR_CC vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // x * y - z return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z)); } // Reciprocal 1/x, Division, Square root static INLINE VECTOR_CC vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) { #ifndef ENABLE_ALTDIV return vdivq_f32(n, d); #else // Finite numbers (including denormal) only, gives mostly correctly rounded result float32x4_t t, u, x, y; uint32x4_t i0, i1; i0 = vandq_u32(vreinterpretq_u32_f32(n), vdupq_n_u32(0x7c000000)); i1 = vandq_u32(vreinterpretq_u32_f32(d), vdupq_n_u32(0x7c000000)); i0 = vsubq_u32(vdupq_n_u32(0x7d000000), vshrq_n_u32(vaddq_u32(i0, i1), 1)); t = vreinterpretq_f32_u32(i0); y = vmulq_f32(d, t); x = vmulq_f32(n, t); t = vrecpeq_f32(y); t = vmulq_f32(t, vrecpsq_f32(y, t)); t = vmulq_f32(t, vrecpsq_f32(y, t)); u = vmulq_f32(x, t); u = vfmaq_f32(u, vfmsq_f32(x, y, u), t); return u; #endif } static INLINE VECTOR_CC vfloat vrec_vf_vf(vfloat d) { #ifndef 
ENABLE_ALTDIV return vdiv_vf_vf_vf(vcast_vf_f(1.0f), d); #else return vbslq_f32(vceqq_f32(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)), vcast_vf_f(0), vdiv_vf_vf_vf(vcast_vf_f(1.0f), d)); #endif } static INLINE VECTOR_CC vfloat vsqrt_vf_vf(vfloat d) { #ifndef ENABLE_ALTSQRT return vsqrtq_f32(d); #else // Gives correctly rounded result for all input range vfloat w, x, y, z; y = vrsqrteq_f32(d); x = vmul_vf_vf_vf(d, y); w = vmul_vf_vf_vf(vcast_vf_f(0.5), y); y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5)); x = vfma_vf_vf_vf_vf(x, y, x); w = vfma_vf_vf_vf_vf(w, y, w); y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(1.5)); w = vadd_vf_vf_vf(w, w); w = vmul_vf_vf_vf(w, y); x = vmul_vf_vf_vf(w, d); y = vfmapn_vf_vf_vf_vf(w, d, x); z = vfmanp_vf_vf_vf_vf(w, x, vcast_vf_f(1)); z = vfmanp_vf_vf_vf_vf(w, y, z); w = vmul_vf_vf_vf(vcast_vf_f(0.5), x); w = vfma_vf_vf_vf_vf(w, z, y); w = vadd_vf_vf_vf(w, x); return vbslq_f32(vorrq_u32(vceqq_f32(d, vcast_vf_f(0)), vceqq_f32(d, vcast_vf_f(SLEEF_INFINITYf))), d, w); #endif } // max, min static INLINE VECTOR_CC vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vmaxq_f32(x, y); } static INLINE VECTOR_CC vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vminq_f32(x, y); } // Comparisons static INLINE VECTOR_CC vmask veq_vm_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); } static INLINE VECTOR_CC vmask vneq_vm_vf_vf(vfloat x, vfloat y) { return vmvnq_u32(vceqq_f32(x, y)); } static INLINE VECTOR_CC vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); } static INLINE VECTOR_CC vmask vle_vm_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); } static INLINE VECTOR_CC vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); } static INLINE VECTOR_CC vmask vge_vm_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); } // Conditional select static INLINE VECTOR_CC vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) { return vbslq_f32(mask, x, y); } // int <--> float conversions static INLINE VECTOR_CC vint2 
vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); } static INLINE VECTOR_CC vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); } static INLINE VECTOR_CC vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); } static INLINE VECTOR_CC vint2 vrint_vi2_vf(vfloat d) { return vcvtq_s32_f32(vrndnq_f32(d)); } /***************************************/ /* Single precision integer operations */ /***************************************/ // Add, Sub, Neg (-x) static INLINE VECTOR_CC vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vaddq_s32(x, y); } static INLINE VECTOR_CC vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsubq_s32(x, y); } static INLINE VECTOR_CC vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); } // Logical operations static INLINE VECTOR_CC vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vandq_s32(x, y); } static INLINE VECTOR_CC vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vbicq_s32(y, x); } static INLINE VECTOR_CC vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vorrq_s32(x, y); } static INLINE VECTOR_CC vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return veorq_s32(x, y); } // Shifts #define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c) //@#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c) #define vsrl_vi2_vi2_i(x, c) \ vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c)) //@#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c)) #define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c) //@#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c) #define vsra_vi_vi_i(x, c) vshr_n_s32(x, c) //@#define vsra_vi_vi_i(x, c) vshr_n_s32(x, c) #define vsll_vi_vi_i(x, c) vshl_n_s32(x, c) //@#define vsll_vi_vi_i(x, c) vshl_n_s32(x, c) #define vsrl_vi_vi_i(x, c) \ vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(x), c)) //@#define vsrl_vi_vi_i(x, c) vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(x), c)) // Comparison returning masks static INLINE VECTOR_CC vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); } 
// NOTE(review): both "gt" helpers below are implemented with vcgeq_s32
// (>=), not vcgtq_s32 (>), although their names say greater-than.  This
// matches the vendored sources, but differs from the opmask variant
// vgt_vo_vi2_vi2 further down (strict vcgtq_s32) and from the x86
// helpers (_mm_cmpgt_epi32) -- confirm upstream intent before relying on
// strictness at x == y.
static INLINE VECTOR_CC vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { return vcgeq_s32(x, y); }

// Comparison returning integers
static INLINE VECTOR_CC vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return vreinterpretq_s32_u32(vcgeq_s32(x, y)); }
static INLINE VECTOR_CC vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return vreinterpretq_s32_u32(vceqq_s32(x, y)); }

// Conditional select
static INLINE VECTOR_CC vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) { return vbslq_s32(m, x, y); }

/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */

/****************************************/
/* Double precision FP operations */
/****************************************/

// Broadcast
static INLINE VECTOR_CC vdouble vcast_vd_d(double f) { return vdupq_n_f64(f); }

// Add, Sub, Mul
static INLINE VECTOR_CC vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return vaddq_f64(x, y); }
static INLINE VECTOR_CC vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return vsubq_f64(x, y); }
static INLINE VECTOR_CC vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return vmulq_f64(x, y); }

// |x|, -x
static INLINE VECTOR_CC vdouble vabs_vd_vd(vdouble f) { return vabsq_f64(f); }
static INLINE VECTOR_CC vdouble vneg_vd_vd(vdouble f) { return vnegq_f64(f); }

// max, min
static INLINE VECTOR_CC vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return vmaxq_f64(x, y); }
static INLINE VECTOR_CC vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return vminq_f64(x, y); }

#if CONFIG == 1
// Multiply accumulate: z = z + x * y
static INLINE VECTOR_CC vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vfmaq_f64(z, x, y); }
// z - x * y
static INLINE VECTOR_CC vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vfmsq_f64(z, x, y); }
//[z = x * y - z]
static INLINE VECTOR_CC vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vneg_vd_vd(vfmsq_f64(z, x, y)); }
#else
// Non-fused fallbacks (separate mul + add/sub rounding).
static INLINE VECTOR_CC vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE VECTOR_CC vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
#endif

static INLINE VECTOR_CC vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z + x * y
  return vfmaq_f64(z, x, y);
}

static INLINE VECTOR_CC vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z - x * y
  return vfmsq_f64(z, x, y);
}

static INLINE VECTOR_CC vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // x * y - z
  return vneg_vd_vd(vfmanp_vd_vd_vd_vd(x, y, z));
}

// Reciprocal 1/x, Division, Square root
static INLINE VECTOR_CC vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {
#ifndef ENABLE_ALTDIV
  return vdivq_f64(n, d);
#else
  // Finite numbers (including denormal) only, gives mostly correctly rounded result
  // Both operands are pre-scaled by a shared power-of-two-ish factor built
  // from their exponent fields (the 0x7fc0.../0x7fd0... constants) so the
  // reciprocal iteration below stays in range, then refined with FMA.
  float64x2_t t, u, x, y;
  uint64x2_t i0, i1;
  i0 = vandq_u64(vreinterpretq_u64_f64(n), vdupq_n_u64(0x7fc0000000000000L));
  i1 = vandq_u64(vreinterpretq_u64_f64(d), vdupq_n_u64(0x7fc0000000000000L));
  i0 = vsubq_u64(vdupq_n_u64(0x7fd0000000000000L), vshrq_n_u64(vaddq_u64(i0, i1), 1));
  t = vreinterpretq_f64_u64(i0);
  y = vmulq_f64(d, t);
  x = vmulq_f64(n, t);
  t = vrecpeq_f64(y);
  t = vmulq_f64(t, vrecpsq_f64(y, t));
  t = vmulq_f64(t, vrecpsq_f64(y, t));
  t = vmulq_f64(t, vrecpsq_f64(y, t));
  u = vmulq_f64(x, t);
  u = vfmaq_f64(u, vfmsq_f64(x, y, u), t);
  return u;
#endif
}

static INLINE VECTOR_CC vdouble vrec_vd_vd(vdouble d) {
#ifndef ENABLE_ALTDIV
  return vdiv_vd_vd_vd(vcast_vd_d(1.0f), d);
#else
  // ALTDIV divide is finite-only: force 1/inf to 0 explicitly.
  return vbslq_f64(vceqq_f64(vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY)),
                   vcast_vd_d(0), vdiv_vd_vd_vd(vcast_vd_d(1.0f), d));
#endif
}

static INLINE VECTOR_CC vdouble vsqrt_vd_vd(vdouble d) {
#ifndef ENABLE_ALTSQRT
  return vsqrtq_f64(d);
#else
  // Gives correctly rounded result for all input range
  // Same scheme as the float version, with one extra refinement step for
  // double precision; 0 and +inf bypass the iteration via the final select.
  vdouble w, x, y, z;

  y = vrsqrteq_f64(d);
  x = vmul_vd_vd_vd(d, y);
  w = vmul_vd_vd_vd(vcast_vd_d(0.5), y);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x);
  w = vfma_vd_vd_vd_vd(w, y, w);

  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x);
  w = vfma_vd_vd_vd_vd(w, y, w);

  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(1.5));
  w = vadd_vd_vd_vd(w, w);
  w = vmul_vd_vd_vd(w, y);
  x = vmul_vd_vd_vd(w, d);
  y = vfmapn_vd_vd_vd_vd(w, d, x);
  z = vfmanp_vd_vd_vd_vd(w, x, vcast_vd_d(1));
  z = vfmanp_vd_vd_vd_vd(w, y, z);
  w = vmul_vd_vd_vd(vcast_vd_d(0.5), x);
  w = vfma_vd_vd_vd_vd(w, z, y);
  w = vadd_vd_vd_vd(w, x);

  return vbslq_f64(vorrq_u64(vceqq_f64(d, vcast_vd_d(0)),
                             vceqq_f64(d, vcast_vd_d(SLEEF_INFINITY))), d, w);
#endif
}

/* Comparisons */
// Opmasks are 32-bit-lane masks, so the 64-bit compare results are
// reinterpreted to u32 (each double lane yields two identical u32 lanes).
static INLINE VECTOR_CC vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpretq_u32_u64(vceqq_f64(x, y)); }
static INLINE VECTOR_CC vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(x, y))); }
static INLINE VECTOR_CC vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpretq_u32_u64(vcltq_f64(x, y)); }
static INLINE VECTOR_CC vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpretq_u32_u64(vcgtq_f64(x, y)); }
static INLINE VECTOR_CC vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpretq_u32_u64(vcleq_f64(x, y)); }
static INLINE VECTOR_CC vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpretq_u32_u64(vcgeq_f64(x, y)); }

// Conditional select
static INLINE VECTOR_CC vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) { return vbslq_f64(vreinterpretq_u64_u32(mask), x, y); }

#if 1
static INLINE CONST VECTOR_CC vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}

static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}

static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
#else
// This implementation is slower on the current CPU models (as of May 2017.)
// I(Naoki Shibata) expect that on future CPU models with hardware similar to Super Shuffle Engine, this implementation will be faster.
static INLINE CONST VECTOR_CC vdouble vsel_vd_vo_d_d(vopmask o, double d0, double d1) {
  uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 }, (uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 });
  uint8x16_t tab = (uint8x16_t) (float64x2_t) { d0, d1 };
  return (vdouble) vqtbl1q_u8(tab, idx);
}

static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o0), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
                            vbslq_u8(vreinterpretq_u8_u32(o1), (uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 },
                                     vbslq_u8(vreinterpretq_u8_u32(o2), (uint8x16_t) { 16, 17, 18, 19, 20, 21, 22, 23, 16, 17, 18, 19, 20, 21, 22, 23 },
                                              (uint8x16_t) { 24, 25, 26, 27, 28, 29, 30, 31, 24, 25, 26, 27, 28, 29, 30, 31 })));
  uint8x16x2_t tab = { { (uint8x16_t) (float64x2_t) { d0, d1 }, (uint8x16_t) (float64x2_t) { d2, d3 } } };
  return (vdouble) vqtbl2q_u8(tab, idx);
}

static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
}
#endif

// Round to nearest (ties to even).
static INLINE VECTOR_CC vdouble vrint_vd_vd(vdouble d) { return vrndnq_f64(d); }
static INLINE VECTOR_CC vfloat vrint_vf_vf(vfloat d) { return vrndnq_f32(d); }
/****************************************/
/* int <--> float conversions */
/****************************************/
// NOTE(review): vtruncate narrows with vmovn_s64 (modular wrap of the
// low 32 bits) while vrint below narrows with vqmovn_s64 (saturating) --
// confirm the asymmetry is intentional before relying on out-of-range
// behavior.
static INLINE VECTOR_CC vint vtruncate_vi_vd(vdouble vf) { return vmovn_s64(vcvtq_s64_f64(vf)); }
static INLINE VECTOR_CC vdouble vcast_vd_vi(vint vi) { return vcvtq_f64_s64(vmovl_s32(vi)); }
static INLINE VECTOR_CC vint vcast_vi_i(int i) { return vdup_n_s32(i); }
static INLINE VECTOR_CC vint vrint_vi_vd(vdouble d) { return vqmovn_s64(vcvtq_s64_f64(vrndnq_f64(d))); }

/***************************************/
/* Integer operations */
/***************************************/

// Add, Sub, Neg (-x)
static INLINE VECTOR_CC vint vadd_vi_vi_vi(vint x, vint y) { return vadd_s32(x, y); }
static INLINE VECTOR_CC vint vsub_vi_vi_vi(vint x, vint y) { return vsub_s32(x, y); }
static INLINE VECTOR_CC vint vneg_vi_vi(vint e) { return vneg_s32(e); }

// Logical operations ("andnot(x, y)" is (~x) & y, hence swapped vbic args)
static INLINE VECTOR_CC vint vand_vi_vi_vi(vint x, vint y) { return vand_s32(x, y); }
static INLINE VECTOR_CC vint vandnot_vi_vi_vi(vint x, vint y) { return vbic_s32(y, x); }
static INLINE VECTOR_CC vint vor_vi_vi_vi(vint x, vint y) { return vorr_s32(x, y); }
static INLINE VECTOR_CC vint vxor_vi_vi_vi(vint x, vint y) { return veor_s32(x, y); }

// Comparison returning masks.  vint is a 64-bit half-vector; the result
// is widened to a full opmask with a zero upper half.
static INLINE VECTOR_CC vopmask veq_vo_vi_vi(vint x, vint y) { return vcombine_u32(vceq_s32(x, y), vdup_n_u32(0)); }

// Conditional select (only the low half of the full-width mask is used).
static INLINE VECTOR_CC vint vsel_vi_vm_vi_vi(vmask m, vint x, vint y) { return vbsl_s32(vget_low_u32(m), x, y); }

/***************************************/
/* Predicates */
/***************************************/
static INLINE VECTOR_CC vopmask visinf_vo_vd(vdouble d) {
  const float64x2_t inf = vdupq_n_f64(SLEEF_INFINITY);
  const float64x2_t neg_inf = vdupq_n_f64(-SLEEF_INFINITY);
  uint64x2_t cmp = vorrq_u64(vceqq_f64(d, inf), vceqq_f64(d, neg_inf));
  return vreinterpretq_u32_u64(cmp);
}

// NaN is the only value for which d != d.
static INLINE VECTOR_CC vopmask visnan_vo_vd(vdouble d) { return vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(d, d))); }
static INLINE VECTOR_CC vopmask vispinf_vo_vd(vdouble d) { return vreinterpretq_u32_u64(vceqq_f64(d, vdupq_n_f64(SLEEF_INFINITY))); }
static INLINE VECTOR_CC vopmask visminf_vo_vd(vdouble d) { return vreinterpretq_u32_u64(vceqq_f64(d, vdupq_n_f64(-SLEEF_INFINITY))); }

// Single-precision opmask selects.
static INLINE VECTOR_CC vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) { return vbslq_f32(mask, x, y); }

static INLINE CONST VECTOR_CC vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}

static INLINE VECTOR_CC vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}

static INLINE VECTOR_CC vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}

// Single-precision comparisons returning opmasks.
static INLINE VECTOR_CC vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); }
static INLINE VECTOR_CC vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vmvnq_u32(vceqq_f32(x, y)); }
static INLINE VECTOR_CC vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); }
static INLINE VECTOR_CC vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); }
static INLINE VECTOR_CC vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); }
static INLINE VECTOR_CC vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); }

static INLINE VECTOR_CC vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); }
static INLINE VECTOR_CC vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return vcgtq_s32(x, y); }
static INLINE VECTOR_CC vopmask vgt_vo_vi_vi(vint x, vint y) { return vcombine_u32(vcgt_s32(x, y), vdup_n_u32(0)); }
static INLINE VECTOR_CC vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE VECTOR_CC vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE VECTOR_CC vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE VECTOR_CC vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }

// 64-bit-lane opmask <-> 32-bit-lane opmask (lane duplication/compaction).
static INLINE VECTOR_CC vopmask vcast_vo32_vo64(vopmask m) { return vuzpq_u32(m, m).val[0]; }
static INLINE VECTOR_CC vopmask vcast_vo64_vo32(vopmask m) { return vzipq_u32(m, m).val[0]; }

static INLINE VECTOR_CC vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vandq_u32(x, y); }
static INLINE VECTOR_CC vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vbicq_u32(y, x); }
static INLINE VECTOR_CC vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vorrq_u32(x, y); }
static INLINE VECTOR_CC vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return veorq_u32(x, y); }

static INLINE VECTOR_CC vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return vbslq_s32(m, x, y); }
static INLINE VECTOR_CC vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vandq_s32(vreinterpretq_s32_u32(x), y); }
static INLINE VECTOR_CC vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vbicq_s32(y, vreinterpretq_s32_u32(x)); }
static INLINE VECTOR_CC vint vandnot_vi_vo_vi(vopmask x, vint y) { return vbic_s32(y, vget_low_s32(vreinterpretq_s32_u32(x))); }

// opmask/vmask mixing: on this target both are uint32x4, so the vo32 and
// vo64 variants share one implementation.
static INLINE VECTOR_CC vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vandq_u32(x, y); }
static INLINE VECTOR_CC vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vandq_u32(x, y); }
static INLINE VECTOR_CC vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }
static INLINE VECTOR_CC vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }
static INLINE VECTOR_CC vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }
static INLINE VECTOR_CC vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }
static INLINE VECTOR_CC vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return veorq_u32(x, y); }

// Round toward zero.
static INLINE VECTOR_CC vfloat vtruncate_vf_vf(vfloat vd) { return vrndq_f32(vd); }

// Broadcast the 64-bit pattern (i0 << 32) | (uint32_t)i1 into every 64-bit lane.
static INLINE VECTOR_CC vmask vcast_vm_i_i(int i0, int i1) {
  return vreinterpretq_u32_u64(vdupq_n_u64((0xffffffff & (uint64_t)i1) | (((uint64_t)i0) << 32)));
}

static INLINE VECTOR_CC vopmask veq64_vo_vm_vm(vmask x, vmask y) {
  return vreinterpretq_u32_u64(vceqq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}

static INLINE VECTOR_CC vmask vadd64_vm_vm_vm(vmask x, vmask y) {
  return vreinterpretq_u32_s64(vaddq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}

static INLINE VECTOR_CC vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return vbsl_s32(vget_low_u32(m), x, y); }

// Logical operations
static INLINE VECTOR_CC vint vand_vi_vo_vi(vopmask x, vint y) { return vand_s32(vreinterpret_s32_u32(vget_low_u32(x)), y); }

// Widen a 2-lane vint into the upper 32-bit half of each 64-bit lane of a vint2.
static INLINE VECTOR_CC vint2 vcastu_vi2_vi(vint vi) {
  return vreinterpretq_s32_u32(vrev64q_u32(vreinterpretq_u32_u64(vmovl_u32(vreinterpret_u32_s32(vi)))));
}

// Inverse: extract the upper 32-bit halves back into a 2-lane vint.
static INLINE VECTOR_CC vint vcastu_vi_vi2(vint2 vi2) {
  return vreinterpret_s32_u32(vmovn_u64(vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_s32(vi2)))));
}

static INLINE VECTOR_CC vdouble vreinterpret_vd_vi2(vint2 vi) { return vreinterpretq_f64_s32(vi); }
static INLINE VECTOR_CC vdouble vtruncate_vd_vd(vdouble vd) { return vrndq_f64(vd); }

//

// Sign-flip patterns for alternating-sign (posneg/negpos) operations.
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })

static INLINE VECTOR_CC vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE VECTOR_CC vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE VECTOR_CC vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); }
static INLINE VECTOR_CC vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); }

// Alternating subtract/add (even lanes subtract, odd lanes add).
static INLINE VECTOR_CC vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
static INLINE VECTOR_CC vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }
static INLINE VECTOR_CC vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE VECTOR_CC vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }

// Swap the two 64-bit lanes.
static INLINE VECTOR_CC vdouble vrev21_vd_vd(vdouble d0) { return (float64x2_t)vcombine_u64(vget_high_u64((uint64x2_t)d0), vget_low_u64((uint64x2_t)d0)); }
static INLINE VECTOR_CC vdouble vreva2_vd_vd(vdouble vd) { return vd; }

static INLINE VECTOR_CC void vstream_v_p_vd(double *ptr, vdouble v) { vstore_v_p_vd(ptr, v); }
// NOTE(review): the double-precision scatter ignores `step` (a 2-wide
// vector covers a single pair) -- parameter kept for API symmetry.
static INLINE VECTOR_CC void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE VECTOR_CC void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }

// Swap adjacent 32-bit pairs / halves.
static INLINE VECTOR_CC vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); }
static INLINE VECTOR_CC vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), vget_low_f32(d0)); }
static INLINE VECTOR_CC vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); }

static INLINE VECTOR_CC void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }

// Store the vector as two float pairs at strided pair offsets.
static INLINE VECTOR_CC void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}

static INLINE VECTOR_CC void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}

//

// Transpose the 64-bit lanes of the (x, y) pair; note interleave and
// uninterleave are the same permutation here (it is self-inverse).
static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) {
  return (vmask2) { vreinterpretq_u32_u64(vtrn1q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))),
      vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))) };
}

static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
  return (vmask2) { vreinterpretq_u32_u64(vtrn1q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))),
      vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))) };
}

static INLINE vint vuninterleave_vi_vi(vint v) { return v; }
static INLINE vdouble vinterleave_vd_vd(vdouble vd) { return vd; }
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) { return vd; }
static INLINE vmask vinterleave_vm_vm(vmask vm) { return vm; }
static INLINE vmask vuninterleave_vm_vm(vmask vm) { return vm; }

// Unaligned load of a vmask2 (16 bytes per double lane).
static vmask2 vloadu_vm2_p(void *p) {
  vmask2 vm2;
  memcpy(&vm2, p, VECTLENDP * 16);
  return vm2;
}

#if !defined(SLEEF_GENHEADER)
typedef Sleef_quad2 vargquad;

static INLINE vmask2 vcast_vm2_aq(vargquad aq) {
  return vinterleave_vm2_vm2(vloadu_vm2_p(&aq));
}

static INLINE vargquad vcast_aq_vm2(vmask2 vm2) {
  vm2 = vuninterleave_vm2_vm2(vm2);
  vargquad aq;
  memcpy(&aq, &vm2, VECTLENDP * 16);
  return aq;
}
#endif // #if !defined(SLEEF_GENHEADER)

// Nonzero iff every bit of g is zero (OR-reduce, then invert).
static INLINE int vtestallzeros_i_vo64(vopmask g) {
  uint32x2_t x0 = vorr_u32(vget_low_u32(g), vget_high_u32(g));
  uint32x2_t x1 = vpmax_u32(x0, x0);
  return ~vget_lane_u32(x1, 0);
}

static INLINE vmask vsel_vm_vo64_vm_vm(vopmask m, vmask x, vmask y) { return vbslq_u32(m, x, y); }

static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
  return vreinterpretq_u32_s64(vsubq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}

static INLINE vmask vneg64_vm_vm(vmask x) { return vreinterpretq_u32_s64(vnegq_s64(vreinterpretq_s64_u32(x))); }
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return vreinterpretq_u32_u64(vcgtq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y))); } #define vsll64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x), c)) //@#define vsll64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x), c)) #define vsrl64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x), c)) //@#define vsrl64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x), c)) static INLINE vmask vcast_vm_vi(vint vi) { vmask m = vreinterpretq_u32_u64(vmovl_u32(vreinterpret_u32_s32(vi))); return vor_vm_vm_vm(vcast_vm_vi2(vcastu_vi2_vi(vreinterpret_s32_u32(vget_low_u32(vgt_vo_vi_vi(vcast_vi_i(0), vi))))), m); } static INLINE vint vcast_vi_vm(vmask vm) { return vreinterpret_s32_u32(vmovn_u64(vreinterpretq_u64_u32(vm))); } ================================================ FILE: src/helperavx.h ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #if CONFIG == 1 #if !defined(__AVX__) && !defined(SLEEF_GENHEADER) #error Please specify -mavx. #endif #elif CONFIG == 4 #if (!defined(__AVX__) || !defined(__FMA4__)) && !defined(SLEEF_GENHEADER) #error Please specify -mavx and -mfma4. 
#endif #else #error CONFIG macro invalid or not defined #endif #define ENABLE_DP //@#define ENABLE_DP #define LOG2VECTLENDP 2 //@#define LOG2VECTLENDP 2 #define VECTLENDP (1 << LOG2VECTLENDP) //@#define VECTLENDP (1 << LOG2VECTLENDP) #define ENABLE_SP //@#define ENABLE_SP #define LOG2VECTLENSP (LOG2VECTLENDP+1) //@#define LOG2VECTLENSP (LOG2VECTLENDP+1) #define VECTLENSP (1 << LOG2VECTLENSP) //@#define VECTLENSP (1 << LOG2VECTLENSP) #define FULL_FP_ROUNDING //@#define FULL_FP_ROUNDING #define ACCURATE_SQRT //@#define ACCURATE_SQRT #if !defined(SLEEF_GENHEADER) #if defined(_MSC_VER) #include #else #include #endif #include #include "misc.h" #endif // #if !defined(SLEEF_GENHEADER) typedef __m256i vmask; typedef __m256i vopmask; typedef __m256d vdouble; typedef __m128i vint; typedef __m256 vfloat; typedef struct { __m128i x, y; } vint2; typedef struct { vmask x, y; } vmask2; // #if !defined(SLEEF_GENHEADER) #ifndef __SLEEF_H__ static inline void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) { /* We don't care for cpuid detection */ out[0] = 0xFFFFFFFF; out[1] = 0xFFFFFFFF; out[2] = 0xFFFFFFFF; out[3] = 0xFFFFFFFF; } #endif static INLINE int cpuSupportsAVX() { int32_t reg[4]; Sleef_x86CpuID(reg, 1, 0); return (reg[2] & (1 << 28)) != 0; } static INLINE int cpuSupportsFMA4() { int32_t reg[4]; Sleef_x86CpuID(reg, 0x80000001, 0); return (reg[2] & (1 << 16)) != 0; } #if CONFIG == 4 && defined(__AVX__) && defined(__FMA4__) static INLINE int vavailability_i(int name) { //int d = __builtin_cpu_supports("avx") && __builtin_cpu_supports("fma4"); int d = cpuSupportsAVX() && cpuSupportsFMA4(); return d ? 3 : 0; } //typedef vint2 vint2_fma4; #define ENABLE_FMA_DP #define ENABLE_FMA_SP #define ISANAME "AVX + AMD FMA4" #define DFTPRIORITY 21 #else static INLINE int vavailability_i(int name) { int d = cpuSupportsAVX(); return d ? 
3 : 0; } //typedef vint2 vint2_avx; #define ISANAME "AVX" #define DFTPRIORITY 20 #endif #endif // #if !defined(SLEEF_GENHEADER) static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); } static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))); } static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))); } // static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); } static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); } static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm); } static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { vint2 r; r.x = _mm256_castsi256_si128(vreinterpret_vm_vd(vd)); r.y = _mm256_extractf128_si256(vreinterpret_vm_vd(vd), 1); return r; } static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { vmask m = _mm256_castsi128_si256(vi.x); m = _mm256_insertf128_si256(m, vi.y, 1); return vreinterpret_vd_vm(m); } // static vint2 vloadu_vi2_p(int32_t *p) { vint2 r; r.x = _mm_loadu_si128((__m128i *) p ); r.y = _mm_loadu_si128((__m128i *)(p + 4)); return r; } static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *) p , v.x); _mm_storeu_si128((__m128i *)(p + 4), v.y); } static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); } static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); } // static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), 
vreinterpret_vd_vm(y))); } static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask 
vcast_vo32_vo64(vopmask o) { return _mm256_castsi128_si256(_mm256_cvtpd_epi32(_mm256_and_pd(vreinterpret_vd_vm(o), _mm256_set1_pd(-1.0)))); } static INLINE vopmask vcast_vo64_vo32(vopmask o) { return vreinterpret_vm_vd(_mm256_cmp_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(o)), _mm256_set1_pd(-1.0), _CMP_EQ_OQ)); } // static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); } static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); } static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); } static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); } static INLINE vint2 vcastu_vi2_vi(vint vi) { vint2 r; r.x = _mm_and_si128(_mm_shuffle_epi32(vi, 0x40), _mm_set_epi32(-1, 0, -1, 0)); r.y = _mm_and_si128(_mm_shuffle_epi32(vi, 0xc8), _mm_set_epi32(-1, 0, -1, 0)); return r; } static INLINE vint vcastu_vi_vi2(vint2 vi) { return _mm_or_si128(_mm_and_si128(_mm_shuffle_epi32(vi.x, 0x0d), _mm_set_epi32( 0, 0, -1, -1)), _mm_and_si128(_mm_shuffle_epi32(vi.y, 0xd0), _mm_set_epi32(-1, -1, 0, 0))); } static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm256_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1); } static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_cmp_pd(vreinterpret_vd_vm(vxor_vm_vm_vm(vxor_vm_vm_vm(x, y), vreinterpret_vm_vd(_mm256_set1_pd(1.0)))), _mm256_set1_pd(1.0), _CMP_EQ_OQ)); } // static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); } static 
INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); } static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); } static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); } static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); } static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); } static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); } static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); } static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); } static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); } #if CONFIG == 1 static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(z, vmul_vd_vd_vd(x, y)); } #else static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); } static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); } static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); } static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); } static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); } static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); } static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); } static INLINE vdouble 
// SLEEF AVX helper (helperavx.h, FMA4/CONFIG!=1 path ends here).
// Below: tail of vfmann_vd_vd_vd_vd (-(x*y)-z via _mm256_nmsub_pd), then the
// vdouble comparison wrappers (ordered _CMP_*_OQ predicates; NEQ uses the
// unordered _CMP_NEQ_UQ so NaN compares as "not equal"), followed by the
// 128-bit vint add/sub/neg, bitwise, shift and compare wrappers. vopmask is
// 256-bit, so vint ops taking a mask first narrow it with
// _mm256_castsi256_si128. NOTE(review): extraction joined original lines;
// text following a "//" on a line is commented out as presented here.
vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmsub_pd(x, y, z); } #endif static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); } static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); } static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); } static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); } static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); } static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); } // static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); } static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); } static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); } static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); } static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); } static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); } static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); } static INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andnot_si128(_mm256_castsi256_si128(m), y); } static INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si128(_mm256_castsi256_si128(m), y); } static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); } static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); } static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); } static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); } static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return
// Continuation: vint->vopmask comparisons (128-bit result widened into the
// low half of a 256-bit mask via _mm256_castsi128_si256; upper half is
// undefined, which callers of 32-bit opmasks tolerate), blendv-based selects
// for vint/vdouble, the nested scalar-select helpers, IEEE classification
// (visnan uses the d != d self-compare), and (un)aligned double load/store.
_mm_cmpgt_epi32(x, y); } static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpeq_epi32(x, y)); } static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpgt_epi32(x, y)); } static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) { return _mm_blendv_epi8(y, x, _mm256_castsi256_si128(o)); } static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); } static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0)); } static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) { return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2)); } static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) { return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3))); } static INLINE vopmask visinf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ)); } static INLINE vopmask vispinf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ)); } static INLINE vopmask visminf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ)); } static INLINE vopmask visnan_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(d, d, _CMP_NEQ_UQ)); } static INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load_pd(ptr); } static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); } static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); } static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); } static INLINE vdouble vgather_vd_p_vi(const double *ptr,
// Continuation of vgather_vd_p_vi: AVX1 has no hardware gather, so the index
// vector is spilled to a stack array and the doubles are gathered with
// scalar loads into _mm256_set_pd (note reversed operand order: set_pd takes
// the highest lane first). Then: an MSVC-only debugging accessor, the
// vint2 <-> vmask conversions (vint2 is a pair of 128-bit halves on AVX1),
// float<->int conversions/splats, bit-level reinterprets, and the basic
// single-precision arithmetic wrappers.
vint vi) { int a[VECTLENDP]; vstoreu_v_p_vi(a, vi); return _mm256_set_pd(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]); } #if defined(_MSC_VER) // This function is needed when debugging on MSVC. static INLINE double vcast_d_vd(vdouble v) { double a[VECTLENDP]; vstoreu_v_p_vd(a, v); return a[0]; } #endif // static INLINE vint2 vcast_vi2_vm(vmask vm) { vint2 r; r.x = _mm256_castsi256_si128(vm); r.y = _mm256_extractf128_si256(vm, 1); return r; } static INLINE vmask vcast_vm_vi2(vint2 vi) { vmask m = _mm256_castsi128_si256(vi.x); m = _mm256_insertf128_si256(m, vi.y, 1); return m; } static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvtps_epi32(vf)); } static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvttps_epi32(vf)); } static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(vcast_vm_vi2(vi)); } static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); } static INLINE vint2 vcast_vi2_i(int i) { vint2 r; r.x = r.y = _mm_set1_epi32(i); return r; } static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); } static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); } static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); } static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); } static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); } static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); } static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); } static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); } static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); } static INLINE vfloat vabs_vf_vf(vfloat f) { return
// Continuation: vabs/vneg implemented by masking/flipping the sign bit with
// -0.0f, min/max wrappers, then the float multiply-add family. CONFIG==1
// composes mla from separate mul+add (no FMA required); the #else branch
// uses AMD FMA4 intrinsics (_mm256_macc_ps / _mm256_nmacc_ps /
// _mm256_msub_ps / _mm256_nmsub_ps) for fused variants. The ordered float
// comparison wrappers begin at the end of this fragment.
vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); } static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); } static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); } static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); } #if CONFIG == 1 static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } #else static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); } static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); } static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); } static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmsub_ps(x, y, z); } #endif static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); } static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); } static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x,
// Continuation: remaining float comparisons, then the vint2 integer ops.
// On AVX1 there are no 256-bit integer instructions, so every vint2
// operation is performed as two 128-bit SSE ops on the .x/.y halves of the
// struct; mask-typed variants first convert the 256-bit opmask with
// vcast_vi2_vm.
y, _CMP_LT_OQ)); } static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); } static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); } static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); } static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_add_epi32(x.x, y.x), _mm_add_epi32(x.y, y.y) }; return vi; } static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_sub_epi32(x.x, y.x), _mm_sub_epi32(x.y, y.y) }; return vi; } static INLINE vint2 vneg_vi2_vi2(vint2 e) { vint2 vi = { _mm_sub_epi32(_mm_set1_epi32(0), e.x), _mm_sub_epi32(_mm_set1_epi32(0), e.y) }; return vi; } static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_and_si128(x.x, y.x), _mm_and_si128(x.y, y.y) }; return vi; } static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_andnot_si128(x.x, y.x), _mm_andnot_si128(x.y, y.y) }; return vi; } static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_or_si128(x.x, y.x), _mm_or_si128(x.y, y.y) }; return vi; } static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_xor_si128(x.x, y.x), _mm_xor_si128(x.y, y.y) }; return vi; } static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); } static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); } static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { vint2 vi = { _mm_slli_epi32(x.x, c), _mm_slli_epi32(x.y, c) }; return vi; } static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { vint2 vi = { _mm_srli_epi32(x.x, c), _mm_srli_epi32(x.y, c) }; return vi; } static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { vint2 vi = { _mm_srai_epi32(x.x, c), _mm_srai_epi32(x.y, c) }; return vi; } static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y)
// Continuation: vint2 comparisons (both opmask- and vint2-returning forms),
// blendv-based vint2/vfloat selects, nested scalar float selects, 64-bit
// lane addition done per 128-bit half, and float IEEE classification built
// on the comparison wrappers (visnan via d != d).
{ vint2 r; r.x = _mm_cmpeq_epi32(x.x, y.x); r.y = _mm_cmpeq_epi32(x.y, y.y); return vcast_vm_vi2(r); } static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = _mm_cmpgt_epi32(x.x, y.x); r.y = _mm_cmpgt_epi32(x.y, y.y); return vcast_vm_vi2(r); } static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = _mm_cmpeq_epi32(x.x, y.x); r.y = _mm_cmpeq_epi32(x.y, y.y); return r; } static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = _mm_cmpgt_epi32(x.x, y.x); r.y = _mm_cmpgt_epi32(x.y, y.y); return r; } static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { vint2 n = vcast_vi2_vm(m); vint2 r = { _mm_blendv_epi8(y.x, x.x, n.x), _mm_blendv_epi8(y.y, x.y, n.y) }; return r; } static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { vint2 ix = vcast_vi2_vm(x), iy = vcast_vi2_vm(y), iz; iz.x = _mm_add_epi64(ix.x, iy.x); iz.y = _mm_add_epi64(ix.y, iy.y); return vcast_vm_vi2(iz); } static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); } static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0)); } static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2)); } static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3))); } static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); } static INLINE vopmask visnan_vo_vf(vfloat d) { return
// Continuation: end of visnan_vo_vf, then (un)aligned float load/store,
// scalar-indexed float gather (no hardware gather on AVX1), an MSVC debug
// accessor, and the alternating-sign PN/NP mask constants. The masks use the
// GCC/Clang vector compound-literal extension ((vdouble){...}); vposneg /
// vnegpos XOR them in to flip the sign of alternate lanes, and vsubadd maps
// directly to _mm256_addsub_pd/ps (subtract even lanes, add odd lanes).
vneq_vo_vf_vf(d, d); } // static INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_ps(ptr); } static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); } static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); } static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); } static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { int a[VECTLENSP]; vstoreu_v_p_vi2(a, vi2); return _mm256_set_ps(ptr[a[7]], ptr[a[6]], ptr[a[5]], ptr[a[4]], ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]); } #ifdef _MSC_VER // This function is needed when debugging on MSVC. static INLINE float vcast_f_vf(vfloat v) { float a[VECTLENSP]; vstoreu_v_p_vf(a, v); return a[0]; } #endif // #define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 }) #define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 }) #define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f }) #define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f }) static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); } static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); } static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); } static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); } static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_addsub_pd(x, y); } static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_addsub_ps(x, y); } #if CONFIG == 1 static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } static INLINE vfloat
// Continuation: vmlsubadd (mul then subadd; the FMA4 branch fuses it as
// mla(x, y, negpos(z))), pairwise/lane reversal shuffles (vrev21 swaps
// within 128-bit lane pairs, vreva2 additionally swaps the two 128-bit
// halves with permute2f128), non-temporal streaming stores, and strided
// 2-element scatters built from 128-bit extract + store/stream.
vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } #else static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); } static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); } #endif static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm256_shuffle_pd(d0, d0, (0 << 3) | (1 << 2) | (0 << 1) | (1 << 0)); } static INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f128_pd(d0, d0, 1); return _mm256_shuffle_pd(d0, d0, (1 << 3) | (0 << 2) | (1 << 1) | (0 << 0)); } static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_stream_pd(ptr, v); } static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_store_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0)); _mm_store_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1)); } static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0)); _mm_stream_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1)); } // static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); } static INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_ps(d0, d0, 1); return _mm256_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); } static INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); } static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_ps(ptr, v); } static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0)))); _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2),
// Continuation of vscatter2_v_p_i_i_vf: each float pair is moved as one
// 64-bit store (storel/storeh on the reinterpreted 128-bit half). Then the
// quad-precision support glue: (un)interleave for vmask2 — unpacklo/unpackhi
// of the two vdouble-viewed halves, a permutation that is its own inverse,
// which is why interleave and uninterleave have identical bodies — the
// lane-swap helpers for vint/vdouble/vmask (tmp[1]<->tmp[2] swap through a
// stack buffer, also self-inverse), and vloadu_vm2_p via memcpy.
_mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0)))); _mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1)))); _mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1)))); } static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); } // static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) { return (vmask2) { vreinterpret_vm_vd(_mm256_unpacklo_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))), vreinterpret_vm_vd(_mm256_unpackhi_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))) }; } static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) { return (vmask2) { vreinterpret_vm_vd(_mm256_unpacklo_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))), vreinterpret_vm_vd(_mm256_unpackhi_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))) }; } static INLINE vint vuninterleave_vi_vi(vint v) { return _mm_shuffle_epi32(v, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6)); } static INLINE vdouble vinterleave_vd_vd(vdouble vd) { double tmp[4]; vstoreu_v_p_vd(tmp, vd); double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t; return vloadu_vd_p(tmp); } static INLINE vdouble vuninterleave_vd_vd(vdouble vd) { double tmp[4]; vstoreu_v_p_vd(tmp, vd); double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t; return vloadu_vd_p(tmp); } static INLINE vmask vinterleave_vm_vm(vmask vm) { double tmp[4]; vstoreu_v_p_vd(tmp, vreinterpret_vd_vm(vm)); double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t; return vreinterpret_vm_vd(vloadu_vd_p(tmp)); } static INLINE vmask vuninterleave_vm_vm(vmask vm) { double tmp[4]; vstoreu_v_p_vd(tmp, vreinterpret_vd_vm(vm)); double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t; return vreinterpret_vm_vd(vloadu_vd_p(tmp)); } static vmask2 vloadu_vm2_p(void *p) { vmask2 vm2; memcpy(&vm2, p, VECTLENDP * 16); return vm2; } #if !defined(SLEEF_GENHEADER) typedef Sleef_quad4 vargquad; static INLINE
// Continuation: Sleef_quad4 <-> vmask2 conversion (memcpy + (un)interleave),
// then the 64-bit integer layer for the quad path: all-zero test via
// movemask over the ORed 128-bit halves, 64-bit blend/sub/neg/compare built
// from per-half SSE 4.2 ops (_mm_cmpgt_epi64, _mm_sub_epi64), and the
// vsll64/vsrl64 shift macros (also emitted as //@ lines for SLEEF's header
// generator).
vmask2 vcast_vm2_aq(vargquad aq) { return vinterleave_vm2_vm2(vloadu_vm2_p(&aq)); } static INLINE vargquad vcast_aq_vm2(vmask2 vm2) { vm2 = vuninterleave_vm2_vm2(vm2); vargquad aq; memcpy(&aq, &vm2, VECTLENDP * 16); return aq; } #endif // #if !defined(SLEEF_GENHEADER) static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm_movemask_epi8(_mm_or_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))) == 0; } static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_blendv_pd(vreinterpret_vd_vm(y), vreinterpret_vd_vm(x), vreinterpret_vd_vm(o))); } static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { __m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0); __m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0); vmask r = _mm256_castsi128_si256(_mm_sub_epi64(xl, yl)); return _mm256_insertf128_si256(r, _mm_sub_epi64(xh, yh), 1); } static INLINE vmask vneg64_vm_vm(vmask x) { return vsub64_vm_vm_vm(vcast_vm_i_i(0, 0), x); } static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { __m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0); __m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0); vmask r = _mm256_castsi128_si256(_mm_cmpgt_epi64(xl, yl)); return _mm256_insertf128_si256(r, _mm_cmpgt_epi64(xh, yh), 1); } #define vsll64_vm_vm_i(x, c) \ _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi64(_mm256_extractf128_si256(x, 0), c)), \ _mm_slli_epi64(_mm256_extractf128_si256(x, 1), c), 1) #define vsrl64_vm_vm_i(x, c) \ _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), \ _mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1) //@#define vsll64_vm_vm_i(x, c) _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi64(_mm256_extractf128_si256(x, 0), c)), _mm_slli_epi64(_mm256_extractf128_si256(x, 1), c), 1) //@#define vsrl64_vm_vm_i(x, c)
_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), _mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1) static INLINE vmask vcast_vm_vi(vint vi) { vint vi0 = _mm_and_si128(_mm_shuffle_epi32(vi, (1 << 4) | (1 << 6)), _mm_set_epi32(0, -1, 0, -1)); vint vi1 = _mm_and_si128(_mm_shuffle_epi32(vi, (2 << 0) | (2 << 2) | (3 << 4) | (3 << 6)), _mm_set_epi32(0, -1, 0, -1)); vmask m = _mm256_insertf128_si256(_mm256_castsi128_si256(vi0), vi1, 1); return vor_vm_vm_vm(vcast_vm_vi2(vcastu_vi2_vi(vand_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi), vcast_vi_i(-1)))), m); } static INLINE vint vcast_vi_vm(vmask vm) { return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)), _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80))); } ================================================ FILE: src/helperavx2.h ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #if CONFIG == 1 #if !defined(__AVX2__) && !defined(SLEEF_GENHEADER) #error Please specify -mavx2. 
// helperavx2.h configuration: 4 doubles / 8 floats per vector (LOG2VECTLENDP
// = 2), with native FMA, full FP rounding and accurate sqrt advertised to
// SLEEF. On AVX2, vmask/vopmask/vint2 are all plain __m256i and vint is
// __m128i (unlike the AVX1 helper's split vint2 struct). Then the cpuid
// probes: Sleef_x86CpuID here is a stub returning all-ones (nsimd does not
// do runtime cpuid detection), so cpuSupportsAVX2 (leaf 7 EBX bit 5) and
// cpuSupportsFMA (leaf 1 ECX bit 12) always report support.
#endif #else #error CONFIG macro invalid or not defined #endif #define ENABLE_DP //@#define ENABLE_DP #define LOG2VECTLENDP 2 //@#define LOG2VECTLENDP 2 #define VECTLENDP (1 << LOG2VECTLENDP) //@#define VECTLENDP (1 << LOG2VECTLENDP) #define ENABLE_FMA_DP //@#define ENABLE_FMA_DP #define ENABLE_SP //@#define ENABLE_SP #define LOG2VECTLENSP (LOG2VECTLENDP+1) //@#define LOG2VECTLENSP (LOG2VECTLENDP+1) #define VECTLENSP (1 << LOG2VECTLENSP) //@#define VECTLENSP (1 << LOG2VECTLENSP) #define ENABLE_FMA_SP //@#define ENABLE_FMA_SP #define FULL_FP_ROUNDING //@#define FULL_FP_ROUNDING #define ACCURATE_SQRT //@#define ACCURATE_SQRT #if !defined(SLEEF_GENHEADER) #if defined(_MSC_VER) #include #else #include #endif #include #include "misc.h" #endif // #if !defined(SLEEF_GENHEADER) typedef __m256i vmask; typedef __m256i vopmask; typedef __m256d vdouble; typedef __m128i vint; typedef __m256 vfloat; typedef __m256i vint2; typedef struct { vmask x, y; } vmask2; // #if !defined(SLEEF_GENHEADER) #ifndef __SLEEF_H__ static inline void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) { /* We don't care for cpuid detection */ out[0] = 0xFFFFFFFF; out[1] = 0xFFFFFFFF; out[2] = 0xFFFFFFFF; out[3] = 0xFFFFFFFF; } #endif static INLINE int cpuSupportsAVX2() { int32_t reg[4]; Sleef_x86CpuID(reg, 7, 0); return (reg[1] & (1 << 5)) != 0; } static INLINE int cpuSupportsFMA() { int32_t reg[4]; Sleef_x86CpuID(reg, 1, 0); return (reg[2] & (1 << 12)) != 0; } #if CONFIG == 1 && defined(__AVX2__) static INLINE int vavailability_i(int name) { int d = cpuSupportsAVX2() && cpuSupportsFMA(); return d ?
// Continuation: vavailability_i returns 3 when AVX2+FMA are reported, then
// ISA identification macros, prefetch, all-ones mask tests (AND the two
// 128-bit halves, then _mm_test_all_ones), double splat/reinterpret
// wrappers, unaligned integer load/store helpers, and the vmask bitwise ops
// routed through the _pd domain (and/andnot/or/xor on reinterpreted
// doubles).
3 : 0; } #define ISANAME "AVX2" #define DFTPRIORITY 25 #endif #endif // #if !defined(SLEEF_GENHEADER) static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); } static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))); } static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))); } // static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); } static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); } static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm); } static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return _mm256_castpd_si256(vd); } static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return _mm256_castsi256_pd(vi); } // static vint2 vloadu_vi2_p(int32_t *p) { return _mm256_loadu_si256((__m256i const *)p); } static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm256_storeu_si256((__m256i *)p, v); } static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); } static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); } // static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x),
// Continuation: the remaining opmask and mask/opmask bitwise combinations
// (all identical _pd-domain and/andnot/or/xor bodies, differing only in the
// declared operand kinds), then the opmask width conversions:
// vcast_vo32_vo64 compacts 4 x 64-bit lanes into the low 4 x 32-bit slots
// and vcast_vo64_vo32 duplicates each 32-bit lane into a 64-bit pair, both
// via _mm256_permutevar8x32_epi32 (AVX2 cross-lane permute).
vreinterpret_vd_vm(y))); } static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vcast_vo32_vo64(vopmask o) { return _mm256_permutevar8x32_epi32(o, _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0)); } static INLINE vopmask vcast_vo64_vo32(vopmask o) { return _mm256_permutevar8x32_epi32(o, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); } // static INLINE vint vrint_vi_vd(vdouble vd) { return
// Continuation: double<->int conversions (cvtpd round/truncate), SSE4.1
// _mm256_round_pd/ps with explicit rounding modes, splats, vcastu_vi2_vi
// (widen i32 to the upper halves of i64 lanes: cvtepi32_epi64 then
// shift-left 32) and its inverse vcastu_vi_vi2 (pick odd 32-bit lanes via
// shuffle_ps), 64-bit lane constants/compare/add, and the basic vdouble
// arithmetic wrappers.
_mm256_cvtpd_epi32(vd); } static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); } static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); } static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); } static INLINE vint2 vcastu_vi2_vi(vint vi) { return _mm256_slli_epi64(_mm256_cvtepi32_epi64(vi), 32); } static INLINE vint vcastu_vi_vi2(vint2 vi) { return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vi)), _mm_set1_ps(0), 0x0d)), _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vi, 1)), 0xd0))); } static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm256_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1); } static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpeq_epi64(x, y); } static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm256_add_epi64(x, y); } // static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); } static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); } static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); } static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); } static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); } static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); } static INLINE vdouble vabs_vd_vd(vdouble d) { return
// Continuation: vabs/vneg via the -0.0 sign-bit mask, then the AVX2 fused
// multiply-add family — every vmla*/vfma* variant maps directly to an FMA3
// intrinsic (_mm256_fmadd/fmsub/fnmadd/fnmsub_pd), so mla and fma coincide
// on this target — min/max, and the vdouble comparison wrappers (ordered
// predicates; NEQ unordered as in the AVX helper).
_mm256_andnot_pd(_mm256_set1_pd(-0.0), d); } static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); } static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); } static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); } static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); } static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); } static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); } static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); } static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); } static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); } static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); } static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmsub_pd(x, y, z); } static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); } static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); } static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); } static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); } static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); } static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); } // static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return
// Continuation: the 128-bit vint ALU/shift/compare wrappers (same shapes as
// the AVX helper), blendv selects, and the AVX2-specific scalar selects:
// vsel_vd_vo_d_d uses _mm256_permutevar_pd on a packed {v1,v0} pair, and
// vsel_vd_vo_vo_vo_d_d_d_d builds a per-lane index vector from the three
// opmasks, then gathers d0..d3 with one cross-lane permutevar8x32.
_mm_add_epi32(x, y); } static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); } static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); } static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); } static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); } static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); } static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); } static INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andnot_si128(_mm256_castsi256_si128(m), y); } static INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si128(_mm256_castsi256_si128(m), y); } static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); } static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); } static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); } static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); } static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); } static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpeq_epi32(x, y)); } static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpgt_epi32(x, y)); } static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, _mm256_castsi256_si128(m)); } static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); } static INLINE vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return _mm256_permutevar_pd(_mm256_set_pd(v1, v0, v1, v0), o); } static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) { __m256i v = _mm256_castpd_si256(vsel_vd_vo_vd_vd(o0, _mm256_castsi256_pd(_mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0)),
vsel_vd_vo_vd_vd(o1, _mm256_castsi256_pd(_mm256_set_epi32(3, 2, 3, 2, 3, 2, 3, 2)), vsel_vd_vo_vd_vd(o2, _mm256_castsi256_pd(_mm256_set_epi32(5, 4, 5, 4, 5, 4, 5, 4)), _mm256_castsi256_pd(_mm256_set_epi32(7, 6, 7, 6, 7, 6, 7, 6)))))); return _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(_mm256_castpd_si256(_mm256_set_pd(d3, d2, d1, d0)), v)); } static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) { return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2); } static INLINE vopmask visinf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ)); } static INLINE vopmask vispinf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ)); } static INLINE vopmask visminf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ)); } static INLINE vopmask visnan_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(d, d, _CMP_NEQ_UQ)); } #if defined(_MSC_VER) // This function is needed when debugging on MSVC. 
// Extract lane 0 of a vdouble (debug aid; only compiled under the MSVC guard above).
static INLINE double vcast_d_vd(vdouble v) { double s[4]; _mm256_storeu_pd(s, v); return s[0]; }
#endif
// Loads/stores of 4 doubles (aligned and unaligned) and a 32-bit-indexed gather
// (scale 8 = sizeof(double)).
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm256_i32gather_pd(ptr, vi, 8); }

//

// vint2 and vmask are both __m256i on AVX2, so these casts are identity.
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
// float <-> int32 conversions: round-to-nearest and truncation.
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvtps_epi32(vf)); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvttps_epi32(vf)); }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { return _mm256_set1_epi32(i); }
// Bit-pattern reinterpretations (no value conversion).
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }
// Single-precision arithmetic on 8 floats.
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } // exact 1/x via division
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }
// |f| clears the sign bit; -f flips it (pure bit manipulation, no FP ops).
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
// Fused multiply-add family: vmla = x*y + z, vmlapn = x*y - z, vmlanp = -(x*y) + z.
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }
// Explicit FMA variants: pp = x*y+z, pn = x*y-z, np = -(x*y)+z, nn = -(x*y)-z.
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmsub_ps(x, y, z); }
// Ordered/unordered float comparisons producing full-width lane masks.
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); }
// 8 x int32 (vint2 = __m256i) arithmetic and logic.
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_add_epi32(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_sub_epi32(x, y); }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_and_si256(x, y); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_andnot_si256(x, y); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_or_si256(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_xor_si256(x, y); }
// Opmask-masked variants reuse the identity vopmask -> vint2 cast.
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return _mm256_slli_epi32(x, c); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return _mm256_srli_epi32(x, c); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return _mm256_srai_epi32(x, c); }
// Comparisons, exposed both as opmask and as vint2 lane masks.
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpeq_epi32(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); }
// Lane-wise selects: m ? x : y.
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return _mm256_blendv_epi8(y, x, m); }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); }

// At this point, the following three functions are implemented in a generic way,
// but I will try target-specific optimization later on.
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0)); } static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2)); } static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3))); } static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); } static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); } #ifdef _MSC_VER // This function is needed when debugging on MSVC. 
// Extract lane 0 of a vfloat (debug aid; only compiled under the MSVC guard above).
static INLINE float vcast_f_vf(vfloat v) { float s[8]; _mm256_storeu_ps(s, v); return s[0]; }
#endif
// Loads/stores of 8 floats and a 32-bit-indexed gather (scale 4 = sizeof(float)).
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm256_i32gather_ps(ptr, vi2, 4); }

//

// Alternating sign-bit masks (+,-,+,-) and (-,+,-,+) for the posneg/negpos helpers.
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })

// Negate alternating lanes by XOR-ing the sign bits with the masks above.
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }

// addsub: even lanes subtracted, odd lanes added; vmlsubadd combines it with vmla.
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_addsub_pd(x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_addsub_ps(x, y); }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }

// Swap each adjacent pair of doubles (in-lane shuffle).
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm256_shuffle_pd(d0, d0, (0 << 3) | (1 << 2) | (0 << 1) | (1 << 0)); }
// Reorder the two-double groups: swap the 128-bit halves, then shuffle within lanes.
static INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f128_pd(d0, d0, 1); return _mm256_shuffle_pd(d0, d0, (1 << 3) | (0 << 2) | (1 << 1) | (0 << 0)); }

static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_stream_pd(ptr, v); } // non-temporal store
// Scatter the vector as two 128-bit (2-double) groups at strided element offsets.
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
  _mm_store_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
  _mm_store_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
}
// Same, but with non-temporal (streaming) stores.
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
  _mm_stream_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
  _mm_stream_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
}

//

// Float counterparts of rev21/reva2.
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
static INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_ps(d0, d0, 1); return _mm256_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
static INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); }
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_ps(ptr, v); }
// Scatter the vector as four 64-bit (2-float) groups; each pair moves via a double store.
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
  _mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
  _mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
}
// No streaming variant for floats; falls back to the regular scatter.
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }

//

// 64-bit (de)interleave of a 2 x 256-bit value. NOTE(review): both directions use the
// identical unpacklo/unpackhi pattern — presumably an involution for this lane layout;
// confirm against upstream SLEEF if touched.
static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) { return (vmask2) { _mm256_unpacklo_epi64(v.x, v.y), _mm256_unpackhi_epi64(v.x, v.y) }; }
static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) { return (vmask2) { _mm256_unpacklo_epi64(v.x, v.y), _mm256_unpackhi_epi64(v.x, v.y) }; }
static INLINE vint vuninterleave_vi_vi(vint v) { return _mm_shuffle_epi32(v, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6)); }
// 64-bit cross-lane permutes; same control word in both directions (self-inverse order 0,2,1,3).
static INLINE vdouble vinterleave_vd_vd(vdouble vd) { return vreinterpret_vd_vm(_mm256_permute4x64_epi64(vreinterpret_vm_vd(vd), (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0))); }
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) { return vreinterpret_vd_vm(_mm256_permute4x64_epi64(vreinterpret_vm_vd(vd), (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0))); }
static INLINE vmask vinterleave_vm_vm(vmask vm) { return _mm256_permute4x64_epi64(vm, (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0)); }
static INLINE vmask vuninterleave_vm_vm(vmask vm) { return _mm256_permute4x64_epi64(vm, (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0)); }
// Unaligned vmask2 load via memcpy (VECTLENDP * 16 bytes = 2 x 256 bits).
static vmask2 vloadu_vm2_p(void *p) { vmask2 vm2; memcpy(&vm2, p, VECTLENDP * 16); return vm2; }

#if !defined(SLEEF_GENHEADER)
// Conversions between the quad-precision argument type and the interleaved vmask2 form.
typedef Sleef_quad4 vargquad;

static INLINE vmask2 vcast_vm2_aq(vargquad aq) { return vinterleave_vm2_vm2(vloadu_vm2_p(&aq)); }

static INLINE vargquad vcast_aq_vm2(vmask2 vm2) {
  vm2 = vuninterleave_vm2_vm2(vm2);
  vargquad aq;
  memcpy(&aq, &vm2, VECTLENDP * 16);
  return aq;
}
#endif // #if !defined(SLEEF_GENHEADER)

// True iff every bit of the opmask is zero (OR the two halves, then byte movemask).
static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm_movemask_epi8(_mm_or_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))) == 0; }
// 64-bit-lane select and integer arithmetic on the mask type.
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return _mm256_blendv_epi8(y, x, o); }
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm256_sub_epi64(x, y); }
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm256_sub_epi64(vcast_vm_i_i(0, 0), x); }
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpgt_epi64(x, y); } // signed compare

// 64-bit shift-by-immediate macros; the //@ copies are presumably consumed by SLEEF's
// header generator (SLEEF_GENHEADER) — they are plain comments to the compiler.
#define vsll64_vm_vm_i(x, c) _mm256_slli_epi64(x, c)
#define vsrl64_vm_vm_i(x, c) _mm256_srli_epi64(x, c)
//@#define vsll64_vm_vm_i(x, c) _mm256_slli_epi64(x, c)
//@#define vsrl64_vm_vm_i(x, c) _mm256_srli_epi64(x, c)

// Sign-extend 4 x int32 to 4 x int64.
static INLINE vmask vcast_vm_vi(vint vi) { return _mm256_cvtepi32_epi64(vi); }
// Narrow 4 x int64 back to 4 x int32 by packing the low 32-bit halves of each lane.
static INLINE vint vcast_vi_vm(vmask vm) { return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)), _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80))); }


================================================
FILE: src/helperavx512f.h
================================================
// Copyright Naoki Shibata and contributors 2010 - 2020.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

// CONFIG == 1 is the FMA build, CONFIG == 2 the no-FMA (AVX512FNOFMA) build.
#if CONFIG == 1 || CONFIG == 2

#if !defined(__AVX512F__) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx512f.
#endif

#else
#error CONFIG macro invalid or not defined
#endif

// Vector lengths: 8 doubles (2^3) and 16 floats (2^4) per 512-bit vector.
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 3
//@#define LOG2VECTLENDP 3
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)

#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)

#if CONFIG == 1
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#endif

#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT

#if !defined(SLEEF_GENHEADER)
#if defined(_MSC_VER)
// NOTE(review): the #include targets below have been stripped (likely by extraction);
// upstream SLEEF uses <intrin.h> here, an x86 intrinsics header in the #else branch,
// and <stdint.h> before "misc.h" — confirm against upstream before building.
#include
#else
#include
#endif
#include
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)

// AVX-512F vector types: 512-bit data vectors, 16-bit k-register opmask, 256-bit vint.
typedef __m512i vmask;
typedef __mmask16 vopmask;

typedef __m512d vdouble;
typedef __m256i vint;

typedef __m512 vfloat;
typedef __m512i vint2;

typedef struct {
  vmask x, y;
} vmask2;

//

#if !defined(SLEEF_GENHEADER)

#ifndef __SLEEF_H__
static inline void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
  /* We don't care for cpuid detection */
  out[0] = 0xFFFFFFFF;
  out[1] = 0xFFFFFFFF;
  out[2] = 0xFFFFFFFF;
  out[3] = 0xFFFFFFFF;
}
#endif

// Bit 16 of EBX (reg[1]) from CPUID leaf 7 is the AVX-512F feature flag; with the
// stubbed Sleef_x86CpuID above this always reports "supported".
static INLINE int cpuSupportsAVX512F() {
  int32_t reg[4];
  Sleef_x86CpuID(reg, 7, 0);
  return (reg[1] & (1 << 16)) != 0;
}

#if CONFIG == 1 && defined(__AVX512F__)
static INLINE int vavailability_i(int name) {
  int d = cpuSupportsAVX512F();
  return d ? 3 : 0;
}
#define ISANAME "AVX512F"
#define DFTPRIORITY 30
#endif

#if CONFIG == 2 && defined(__AVX512F__)
static INLINE int vavailability_i(int name) {
  int d = cpuSupportsAVX512F();
  return d ? 3 : 0;
}
#define ISANAME "AVX512FNOFMA"
#define DFTPRIORITY 0
#endif

#endif // #if !defined(SLEEF_GENHEADER)

static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }

// All-lanes-true tests on the k-register mask: 8 lanes (0xff) for doubles,
// 16 lanes (0xffff) for floats. ICC needs mask2int to read the k-register.
#ifdef __INTEL_COMPILER
static INLINE int vtestallones_i_vo64(vopmask g) { return _mm512_mask2int(g) == 0xff; }
static INLINE int vtestallones_i_vo32(vopmask g) { return _mm512_mask2int(g) == 0xffff; }
#else
static INLINE int vtestallones_i_vo64(vopmask g) { return g == 0xff; }
static INLINE int vtestallones_i_vo32(vopmask g) { return g == 0xffff; }
#endif

//

// Unaligned integer vector loads/stores.
static vint2 vloadu_vi2_p(int32_t *p) { return _mm512_loadu_si512((__m512i const *)p); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm512_storeu_si512((__m512i *)p, v); }
static vint vloadu_vi_p(int32_t *p) { return _mm256_loadu_si256((__m256i const *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm256_storeu_si256((__m256i *)p, v); }

//

// Bitwise logic on full 512-bit masks; andnot(x, y) computes (~x) & y.
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm512_and_si512(x, y); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm512_andnot_si512(x, y); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm512_or_si512(x, y); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm512_xor_si512(x, y); }

// Logic on k-register opmasks.
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kand(x, y); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kandn(x, y); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kor(x, y); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kxor(x, y); }

// Opmask x bit-mask combinations via masked and/or:
//   vand:    lanes selected by o keep m, others become 0;
//   vandnot: lanes selected by o become 0, others keep m;
//   vor:     lanes selected by o become all-ones, others keep m.
static INLINE vmask vand_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_and_epi64(_mm512_set1_epi32(0), o, m, m); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_and_epi64(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); }
static INLINE vmask vor_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_or_epi64(m, o, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); }
static INLINE vmask vand_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_and_epi32(_mm512_set1_epi32(0), o, m, m); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_and_epi32(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); }
static INLINE vmask vor_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_or_epi32(m, o, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); }
// The same k-register serves both 32- and 64-bit lane views; casts are identity.
static INLINE vopmask vcast_vo32_vo64(vopmask o) { return o; }
static INLINE vopmask vcast_vo64_vo32(vopmask o) { return o; }

//

// double <-> int32 conversions with explicit rounding control.
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm512_cvt_roundpd_epi32(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm512_cvt_roundpd_epi32(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm512_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm256_set1_epi32(i); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm512_roundscale_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm512_roundscale_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
// Spread the 8 int32 into the odd 32-bit slots of a 512-bit vector (mask 0xaaaa),
// and collect them back from those slots (mask 0x00ff selects the low 8 results).
static INLINE vint2 vcastu_vi2_vi(vint vi) { return _mm512_maskz_permutexvar_epi32(0xaaaa, _mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), _mm512_castsi256_si512(vi)); }
static INLINE vint vcastu_vi_vi2(vint2 vi) { return _mm512_castsi512_si256(_mm512_maskz_permutexvar_epi32(0x00ff, _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 7, 5, 3, 1), vi)); }
// Broadcast the (i0, i1) 32-bit pair into all eight 64-bit lanes.
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm512_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1); }
// 64-bit lane compare and add on the mask type.
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_EQ); }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm512_add_epi64(x, y); }

//

static INLINE vdouble vcast_vd_d(double d) { return _mm512_set1_pd(d); }
// Bit-pattern reinterpretations (no value conversion).
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm512_castpd_si512(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm512_castsi512_pd(vm); }
static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return _mm512_castpd_si512(vd); }
static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return _mm512_castsi512_pd(vi); }
// Double-precision arithmetic on 8 doubles.
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm512_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm512_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm512_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm512_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm512_div_pd(_mm512_set1_pd(1), x); } // exact 1/x via division
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm512_sqrt_pd(x); }
// |d| clears the sign bit; -d flips it (pure bit manipulation).
static INLINE vdouble vabs_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_andnot_si512(vreinterpret_vm_vd(_mm512_set1_pd(-0.0)), vreinterpret_vm_vd(d))); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_xor_si512(vreinterpret_vm_vd(_mm512_set1_pd(-0.0)), vreinterpret_vm_vd(d))); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm512_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm512_min_pd(x, y); }

// vmla family: fused under CONFIG == 1, plain mul/add otherwise.
// vmla = x*y + z, vmlapn = x*y - z, vmlanp = -(x*y) + z.
#if CONFIG == 1
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmsub_pd(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmadd_pd(x, y, z); }
#else
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
#endif

// Always-fused variants: pp = x*y+z, pn = x*y-z, np = -(x*y)+z, nn = -(x*y)-z.
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmsub_pd(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmadd_pd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmsub_pd(x, y, z); }

// Double comparisons producing k-register masks.
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_NEQ_UQ); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_LT_OQ); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_LE_OQ); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_GT_OQ); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ); }

//

// 8 x int32 (vint = __m256i on AVX-512) arithmetic and logic.
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm256_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm256_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm256_and_si256(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm256_andnot_si256(x, y); }
// Opmask-masked variants: widen vint to 512 bits, apply the masked op, narrow back.
// vandnot: lanes selected by o become 0, others keep y; vand: lanes selected by o keep y.
static INLINE vint vandnot_vi_vo_vi(vopmask o, vint y) { return _mm512_castsi512_si256(_mm512_mask_and_epi32(_mm512_castsi256_si512(y), o, _mm512_set1_epi32(0), _mm512_set1_epi32(0))); }
static INLINE vint vand_vi_vo_vi(vopmask o, vint y) { return _mm512_castsi512_si256(_mm512_mask_and_epi32(_mm512_set1_epi32(0), o, _mm512_castsi256_si512(y), _mm512_castsi256_si512(y))); }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm256_or_si256(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm256_xor_si256(x, y); }
// Shift-by-immediate macros (//@ copies presumably feed the generated header).
#define vsll_vi_vi_i(x, c) _mm256_slli_epi32(x, c)
#define vsrl_vi_vi_i(x, c) _mm256_srli_epi32(x, c)
#define vsra_vi_vi_i(x, c) _mm256_srai_epi32(x, c)
//@#define vsll_vi_vi_i(x, c) _mm256_slli_epi32(x, c)
//@#define vsrl_vi_vi_i(x, c) _mm256_srli_epi32(x, c)
//@#define vsra_vi_vi_i(x, c) _mm256_srai_epi32(x, c)
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm256_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm256_cmpgt_epi32(x, y); }
// Opmask-producing compares; x > y is expressed as y < x.
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm512_cmp_epi32_mask(_mm512_castsi256_si512(x), _mm512_castsi256_si512(y), _MM_CMPINT_EQ); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm512_cmp_epi32_mask(_mm512_castsi256_si512(y), _mm512_castsi256_si512(x), _MM_CMPINT_LT); }
// Opmask-driven selects: mask ? x : y.
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) { return _mm512_mask_blend_pd(mask, y, x); }
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0)); }

#if 1 // Probably this is faster
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) { __m512i v = _mm512_castpd_si512(vsel_vd_vo_vd_vd(o0, _mm512_castsi512_pd(_mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 0)), vsel_vd_vo_vd_vd(o1, _mm512_castsi512_pd(_mm512_set_epi64(1, 1, 1, 1, 1, 1, 1, 1)), vsel_vd_vo_vd_vd(o2, _mm512_castsi512_pd(_mm512_set_epi64(2, 2, 2, 2, 2, 2, 2, 2)), _mm512_castsi512_pd(_mm512_set_epi64(3, 3, 3, 3, 3, 3, 3, 3)))))); return _mm512_permutexvar_pd(v, _mm512_castpd256_pd512(_mm256_set_pd(d3, d2, d1, d0))); } static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) { return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2); } #else static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) { return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2)); } static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) { return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3))); } #endif static INLINE vopmask visinf_vo_vd(vdouble d) { return _mm512_cmp_pd_mask(vabs_vd_vd(d), _mm512_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ); } static INLINE vopmask vispinf_vo_vd(vdouble d) { return _mm512_cmp_pd_mask(d, _mm512_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ); } static INLINE vopmask visminf_vo_vd(vdouble d) { return _mm512_cmp_pd_mask(d, _mm512_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ); } static INLINE vopmask visnan_vo_vd(vdouble d) { return _mm512_cmp_pd_mask(d, d, _CMP_NEQ_UQ); } static INLINE vint vilogbk_vi_vd(vdouble d) { return vrint_vi_vd(_mm512_getexp_pd(d)); } // vilogb2k_vi_vd is similar to vilogbk_vi_vd, but the argument has to // be a normalized FP value. 
static INLINE vint vilogb2k_vi_vd(vdouble d) { return vrint_vi_vd(_mm512_getexp_pd(d)); }
// Exponent / mantissa extraction using the AVX-512 getexp/getmant instructions;
// getmant normalizes the mantissa into [0.75, 1.5) and propagates NaN signs.
static INLINE vdouble vgetexp_vd_vd(vdouble d) { return _mm512_getexp_pd(d); }
static INLINE vfloat vgetexp_vf_vf(vfloat d) { return _mm512_getexp_ps(d); }

static INLINE vdouble vgetmant_vd_vd(vdouble d) { return _mm512_getmant_pd(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan); }
static INLINE vfloat vgetmant_vf_vf(vfloat d) { return _mm512_getmant_ps(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan); }

// fixupimm wrappers; kept as macros since imm must be a compile-time constant.
#define vfixup_vd_vd_vd_vi2_i(a, b, c, imm) _mm512_fixupimm_pd((a), (b), (c), (imm))
#define vfixup_vf_vf_vf_vi2_i(a, b, c, imm) _mm512_fixupimm_ps((a), (b), (c), (imm))
//@#define vfixup_vd_vd_vd_vi2_i(a, b, c, imm) _mm512_fixupimm_pd((a), (b), (c), (imm))
//@#define vfixup_vf_vf_vf_vi2_i(a, b, c, imm) _mm512_fixupimm_ps((a), (b), (c), (imm))

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) { double s[VECTLENDP]; _mm512_storeu_pd(s, v); return s[0]; }
#endif

// Loads/stores of 8 doubles; note _mm512_i32gather_pd takes (index, base) order,
// the reverse of the AVX2 intrinsic.
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm512_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm512_loadu_pd(ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm512_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm512_storeu_pd(ptr, v); }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm512_i32gather_pd(vi, ptr, 8); }

//

// Opmask-driven vint select: widen to 512 bits, blend, narrow back.
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm512_castsi512_si256(_mm512_mask_blend_epi32(m, _mm512_castsi256_si512(y), _mm512_castsi256_si512(x))); }

//

// Bit-pattern reinterpretations between the 512-bit float/int/double views.
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm512_castps_si512(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm512_castsi512_ps(vm); }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return _mm512_castsi512_ps(vi); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return _mm512_castps_si512(vf); }
static INLINE vdouble vreinterpret_vd_vf(vfloat vf) { return _mm512_castps_pd(vf); }
static INLINE vfloat vreinterpret_vf_vd(vdouble vd) { return _mm512_castpd_ps(vd); }
// vint2 and vmask are both __m512i; identity casts.
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm512_cvtepi32_ps(vcast_vm_vi2(vi)); }

static INLINE vfloat vcast_vf_f(float f) { return _mm512_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { return _mm512_set1_epi32(i); }
// float <-> int32 conversions and float roundings with explicit modes.
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm512_cvtps_epi32(vf)); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm512_cvttps_epi32(vf)); }
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return _mm512_roundscale_ps(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm512_roundscale_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
// Single-precision arithmetic on 16 floats.
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm512_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm512_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm512_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm512_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } // exact 1/x via division
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm512_sqrt_ps(x); }
// |f| clears the sign bit; -f flips it (pure bit manipulation).
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm512_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm512_min_ps(x, y); }

// vmla family: fused under CONFIG == 1, plain mul/add otherwise.
// vmla = x*y + z, vmlapn = x*y - z, vmlanp = z - x*y.
#if CONFIG == 1
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmsub_ps(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmadd_ps(x, y, z); }
#else
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#endif

// Always-fused variants: pp = x*y+z, pn = x*y-z, np = -(x*y)+z, nn = -(x*y)-z.
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmsub_ps(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmadd_ps(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmsub_ps(x, y, z); }

// Float comparisons producing k-register masks.
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_EQ_OQ); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_NEQ_UQ); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_LT_OQ); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_LE_OQ); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); }

// 16 x int32 arithmetic.
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_add_epi32(x, y); }
// Definition continues past this chunk.
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) {
return _mm512_sub_epi32(x, y); } static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); } static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_and_si512(x, y); } static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_andnot_si512(x, y); } static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_or_si512(x, y); } static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_xor_si512(x, y); } static INLINE vint2 vand_vi2_vo_vi2(vopmask o, vint2 m) { return _mm512_mask_and_epi32(_mm512_set1_epi32(0), o, m, m); } static INLINE vint2 vandnot_vi2_vo_vi2(vopmask o, vint2 m) { return _mm512_mask_and_epi32(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); } #define vsll_vi2_vi2_i(x, c) _mm512_slli_epi32(x, c) #define vsrl_vi2_vi2_i(x, c) _mm512_srli_epi32(x, c) #define vsra_vi2_vi2_i(x, c) _mm512_srai_epi32(x, c) //@#define vsll_vi2_vi2_i(x, c) _mm512_slli_epi32(x, c) //@#define vsrl_vi2_vi2_i(x, c) _mm512_srli_epi32(x, c) //@#define vsra_vi2_vi2_i(x, c) _mm512_srai_epi32(x, c) static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_cmpeq_epi32_mask(x, y); } static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_cmpgt_epi32_mask(x, y); } static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { __mmask16 m = _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_EQ); return _mm512_mask_and_epi32(_mm512_set1_epi32(0), m, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); } static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { __mmask16 m = _mm512_cmp_epi32_mask(y, x, _MM_CMPINT_LT); return _mm512_mask_and_epi32(_mm512_set1_epi32(0), m, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); } static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return _mm512_mask_blend_epi32(m, y, x); } static INLINE vfloat vsel_vf_vo_vf_vf(vopmask m, vfloat x, vfloat y) { return _mm512_mask_blend_ps(m, y, x); } // At this point, the following three functions are implemented in a 
generic way, // but I will try target-specific optimization later on. static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0)); } static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2)); } static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3))); } static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); } static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); } static INLINE vint2 vilogbk_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); } static INLINE vint2 vilogb2k_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); } #ifdef _MSC_VER // This function is needed when debugging on MSVC. 
static INLINE float vcast_f_vf(vfloat v) { float s[VECTLENSP]; _mm512_storeu_ps(s, v); return s[0]; } #endif static INLINE vfloat vload_vf_p(const float *ptr) { return _mm512_load_ps(ptr); } static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm512_loadu_ps(ptr); } static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm512_store_ps(ptr, v); } static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm512_storeu_ps(ptr, v); } static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm512_i32gather_ps(vi2, ptr, 4); } // static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vd(d), 0xcccc, vreinterpret_vm_vd(d), vreinterpret_vm_vd(_mm512_set1_pd(-0.0)))); } static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vd(d), 0x3333, vreinterpret_vm_vd(d), vreinterpret_vm_vd(_mm512_set1_pd(-0.0)))); } static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vf(d), 0xaaaa, vreinterpret_vm_vf(d), vreinterpret_vm_vf(_mm512_set1_ps(-0.0f)))); } static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vf(d), 0x5555, vreinterpret_vm_vf(d), vreinterpret_vm_vf(_mm512_set1_ps(-0.0f)))); } static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); } static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); } static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmaddsub_pd(x, y, z); } static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmaddsub_ps(x, y, z); } static INLINE vdouble vrev21_vd_vd(vdouble vd) { return _mm512_permute_pd(vd, 0x55); } static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vreinterpret_vd_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(3, 2, 
1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), vreinterpret_vm_vd(vd))); } static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm512_stream_pd(ptr, v); } static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_store_pd(&ptr[(offset + step * 0)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 0))); _mm_store_pd(&ptr[(offset + step * 1)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 1))); _mm_store_pd(&ptr[(offset + step * 2)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 2))); _mm_store_pd(&ptr[(offset + step * 3)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 3))); } static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd(&ptr[(offset + step * 0)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 0))); _mm_stream_pd(&ptr[(offset + step * 1)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 1))); _mm_stream_pd(&ptr[(offset + step * 2)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 2))); _mm_stream_pd(&ptr[(offset + step * 3)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 3))); } // static INLINE vfloat vrev21_vf_vf(vfloat vf) { return _mm512_permute_ps(vf, 0xb1); } static INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); } static INLINE vfloat vreva2_vf_vf(vfloat vf) { return vreinterpret_vf_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), vreinterpret_vm_vf(vf))); } static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm512_stream_ps(ptr, v); } static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 0))); _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 0))); 
_mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 1))); _mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 1))); _mm_storel_pd((double *)(ptr+(offset + step * 4)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 2))); _mm_storeh_pd((double *)(ptr+(offset + step * 5)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 2))); _mm_storel_pd((double *)(ptr+(offset + step * 6)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 3))); _mm_storeh_pd((double *)(ptr+(offset + step * 7)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 3))); } static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); } // static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) { return (vmask2) { _mm512_unpacklo_epi64(v.x, v.y), _mm512_unpackhi_epi64(v.x, v.y) }; } static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) { return (vmask2) { _mm512_unpacklo_epi64(v.x, v.y), _mm512_unpackhi_epi64(v.x, v.y) }; } static INLINE vint vuninterleave_vi_vi(vint v) { return _mm256_permutevar8x32_epi32(v, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); } static INLINE vdouble vinterleave_vd_vd(vdouble vd) { return vreinterpret_vd_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), vreinterpret_vm_vd(vd))); } static INLINE vdouble vuninterleave_vd_vd(vdouble vd) { return vreinterpret_vd_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0), vreinterpret_vm_vd(vd))); } static INLINE vmask vinterleave_vm_vm(vmask vm) { return _mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), vm); } static INLINE vmask vuninterleave_vm_vm(vmask vm) { return _mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0), vm); } static vmask2 vloadu_vm2_p(void *p) { vmask2 vm2; memcpy(&vm2, p, VECTLENDP * 16); return vm2; } #if 
!defined(SLEEF_GENHEADER) typedef Sleef_quad8 vargquad; static INLINE vmask2 vcast_vm2_aq(vargquad aq) { return vinterleave_vm2_vm2(vloadu_vm2_p(&aq)); } static INLINE vargquad vcast_aq_vm2(vmask2 vm2) { vm2 = vuninterleave_vm2_vm2(vm2); vargquad aq; memcpy(&aq, &vm2, VECTLENDP * 16); return aq; } #endif // #if !defined(SLEEF_GENHEADER) #ifdef __INTEL_COMPILER static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm512_mask2int(g) == 0; } #else static INLINE int vtestallzeros_i_vo64(vopmask g) { return g == 0; } #endif static INLINE vmask vsel_vm_vo64_vm_vm(vopmask m, vmask x, vmask y) { return _mm512_mask_blend_epi64(m, y, x); } static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm512_sub_epi64(x, y); } static INLINE vmask vneg64_vm_vm(vmask x) { return _mm512_sub_epi64(vcast_vm_i_i(0, 0), x); } static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm512_cmp_epi64_mask(y, x, _MM_CMPINT_LT); } // signed compare #define vsll64_vm_vm_i(x, c) _mm512_slli_epi64(x, c) #define vsrl64_vm_vm_i(x, c) _mm512_srli_epi64(x, c) //@#define vsll64_vm_vm_i(x, c) _mm512_slli_epi64(x, c) //@#define vsrl64_vm_vm_i(x, c) _mm512_srli_epi64(x, c) static INLINE vmask vcast_vm_vi(vint vi) { return _mm512_cvtepi32_epi64(vi); } static INLINE vint vcast_vi_vm(vmask vm) { return _mm512_cvtepi64_epi32(vm); } ================================================ FILE: src/helperneon32.h ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #ifndef __ARM_NEON #error Please specify -mfpu=neon. #endif #ifdef __aarch64__ #warning This implementation is for AARCH32. 
#endif #define ENABLE_SP //@#define ENABLE_SP #define LOG2VECTLENSP 2 //@#define LOG2VECTLENSP 2 #define VECTLENSP (1 << LOG2VECTLENSP) //@#define VECTLENSP (1 << LOG2VECTLENSP) #if CONFIG == 4 #define ISANAME "AARCH32 NEON-VFPV4" #define ENABLE_FMA_SP //@#define ENABLE_FMA_SP #else #define ISANAME "AARCH32 NEON" #endif #define DFTPRIORITY 10 #define ENABLE_RECSQRT_SP //@#define ENABLE_RECSQRT_SP #include #include #include "misc.h" typedef uint32x4_t vmask; typedef uint32x4_t vopmask; //typedef int32x4_t vint; typedef float32x4_t vfloat; typedef int32x4_t vint2; // static INLINE void vprefetch_v_p(const void *ptr) { } static INLINE int vtestallones_i_vo32(vopmask g) { uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g)); uint32x2_t x1 = vpmin_u32(x0, x0); return vget_lane_u32(x1, 0); } static vfloat vloaduf(float *p) { return vld1q_f32(p); } static void vstoreuf(float *p, vfloat v) { vst1q_f32(p, v); } static vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); } static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); } // static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); } static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vbicq_u32(y, x); } static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); } static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); } static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vandq_u32(x, y); } static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vbicq_u32(y, x); } static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vorrq_u32(x, y); } static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return veorq_u32(x, y); } static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vandq_u32(x, y); } static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vbicq_u32(y, x); } static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vorrq_u32(x, y); } static INLINE 
vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return veorq_u32(x, y); } static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vandq_u32(x, y); } static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vbicq_u32(y, x); } static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vorrq_u32(x, y); } static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return veorq_u32(x, y); } static INLINE vopmask vcast_vo32_vo64(vopmask m) { return vuzpq_u32(m, m).val[0]; } static INLINE vopmask vcast_vo64_vo32(vopmask m) { return vzipq_u32(m, m).val[0]; } // static INLINE vmask vcast_vm_i_i(int i0, int i1) { return (vmask)vdupq_n_u64((uint64_t)i0 | (((uint64_t)i1) << 32)); } static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { uint32x4_t t = vceqq_u32(x, y); return vandq_u32(t, vrev64q_u32(t)); } // static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; } static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; } static INLINE vint2 vrint_vi2_vf(vfloat d) { return vcvtq_s32_f32(vaddq_f32(d, (float32x4_t)vorrq_u32(vandq_u32((uint32x4_t)d, (uint32x4_t)vdupq_n_f32(-0.0f)), (uint32x4_t)vdupq_n_f32(0.5f)))); } static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); } static INLINE vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); } static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); } static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); } static INLINE vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); } static INLINE vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); } static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; } static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; } static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return (vfloat)vm; } static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; } static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return vaddq_f32(x, y); } 
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return vsubq_f32(x, y); } static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return vmulq_f32(x, y); } static INLINE vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); } static INLINE vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); } #if CONFIG == 4 static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return vfmaq_f32(z, x, y); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmsq_f32(z, x, y); } static INLINE vfloat vfma_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return vfmaq_f32(z, x, y); } static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmsq_f32(z, x, y); } static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z)); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z)); } static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { float32x4_t t = vrecpeq_f32(y), u; t = vmulq_f32(t, vrecpsq_f32(y, t)); t = vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t); u = vmulq_f32(x, t); return vfmaq_f32(u, vfmsq_f32(x, y, u), t); } static INLINE vfloat vsqrt_vf_vf(vfloat d) { float32x4_t x = vrsqrteq_f32(d); x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x))); x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x))); float32x4_t u = vmulq_f32(x, d); u = vfmaq_f32(u, vfmsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5))); return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(u), vceqq_f32(d, vdupq_n_f32(0.0f)))); } static INLINE vfloat vrec_vf_vf(vfloat y) { float32x4_t t = vrecpeq_f32(y), u; t = vmulq_f32(t, vrecpsq_f32(y, t)); t = vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t); return vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t); } static INLINE vfloat vrecsqrt_vf_vf(vfloat d) { float32x4_t x = vrsqrteq_f32(d); x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x))); return vfmaq_f32(x, 
vfmsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5))); } #else // #if CONFIG == 4 static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlaq_f32(z, x, y); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlsq_f32(z, x, y); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vmlsq_f32(z, x, y)); } static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) { float32x4_t x = vrecpeq_f32(d); x = vmulq_f32(x, vrecpsq_f32(d, x)); float32x4_t t = vmulq_f32(n, x); return vmlsq_f32(vaddq_f32(t, t), vmulq_f32(t, x), d); } static INLINE vfloat vsqrt_vf_vf(vfloat d) { float32x4_t x = vrsqrteq_f32(d); x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x))); float32x4_t u = vmulq_f32(x, d); u = vmlaq_f32(u, vmlsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5))); return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(u), vceqq_f32(d, vdupq_n_f32(0.0f)))); } static INLINE vfloat vrec_vf_vf(vfloat d) { float32x4_t x = vrecpeq_f32(d); x = vmulq_f32(x, vrecpsq_f32(d, x)); return vmlsq_f32(vaddq_f32(x, x), vmulq_f32(x, x), d); } static INLINE vfloat vrecsqrt_vf_vf(vfloat d) { float32x4_t x = vrsqrteq_f32(d); x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x))); return vmlaq_f32(x, vmlsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5))); } #endif // #if CONFIG == 4 static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vmaxq_f32(x, y); } static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vminq_f32(x, y); } static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); } static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vmvnq_u32(vceqq_f32(x, y)); } static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); } static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); } static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); } 
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); } static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vaddq_s32(x, y); } static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsubq_s32(x, y); } static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); } static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vandq_s32(x, y); } static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vbicq_s32(y, x); } static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vorrq_s32(x, y); } static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return veorq_s32(x, y); } static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vandq_u32(x, (vopmask)y); } static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vbicq_u32((vopmask)y, x); } #define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c) #define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c)) #define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c) //@#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c) //@#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c)) //@#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c) static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); } static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return vcgtq_s32(x, y); } static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vceqq_s32(x, y); } static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vcgtq_s32(x, y); } static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return (vint2)vbslq_u32(m, (vmask)x, (vmask)y); } static INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) { return (vfloat)vbslq_u32(mask, (vmask)x, (vmask)y); } static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0)); } static INLINE vfloat 
vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2)); } static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3))); } static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); } static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); } // This function is needed when debugging on MSVC. static INLINE float vcast_f_vf(vfloat v) { float p[4]; vst1q_f32 (p, v); return p[0]; } static INLINE int vavailability_i(int name) { if (name != 2) return 0; return vcast_f_vf(vadd_vf_vf_vf(vcast_vf_f(name), vcast_vf_f(name))) != 0.0; } static INLINE vfloat vload_vf_p(const float *ptr) { return vld1q_f32(__builtin_assume_aligned(ptr, 16)); } static INLINE vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); } static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(__builtin_assume_aligned(ptr, 16), v); } static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); } static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return ((vfloat) { ptr[vgetq_lane_s32(vi2, 0)], ptr[vgetq_lane_s32(vi2, 1)], ptr[vgetq_lane_s32(vi2, 2)], ptr[vgetq_lane_s32(vi2, 3)] }); } #define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f }) #define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f }) static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); } static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); } static INLINE 
vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); } static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } static INLINE vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); } static INLINE vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), vget_low_f32(d0)); } static INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); } static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); } static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v)); vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v)); } static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v)); vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v)); } ================================================ FILE: src/helperpower_128.h ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #if CONFIG == 1 || CONFIG == 2 #ifndef __VSX__ #error Please specify -mcpu=power8 or -mcpu=power9 #endif #else #error CONFIG macro invalid or not defined #endif #define ENABLE_DP //@#define ENABLE_DP #define LOG2VECTLENDP 1 //@#define LOG2VECTLENDP 1 #define VECTLENDP (1 << LOG2VECTLENDP) //@#define VECTLENDP (1 << LOG2VECTLENDP) #define ENABLE_SP //@#define ENABLE_SP #define LOG2VECTLENSP (LOG2VECTLENDP+1) //@#define LOG2VECTLENSP (LOG2VECTLENDP+1) #define VECTLENSP (1 << LOG2VECTLENSP) //@#define VECTLENSP (1 << LOG2VECTLENSP) #if CONFIG == 1 #define ENABLE_FMA_DP //@#define ENABLE_FMA_DP #define ENABLE_FMA_SP //@#define ENABLE_FMA_SP #endif #define ACCURATE_SQRT //@#define ACCURATE_SQRT #define FULL_FP_ROUNDING //@#define FULL_FP_ROUNDING #if !defined(SLEEF_GENHEADER) #include // undef altivec types since CPP and C99 use them as compiler tokens // use __vector and __bool instead #undef vector #undef bool #include #include "misc.h" #endif // #if !defined(SLEEF_GENHEADER) #define ISANAME "VSX" #define DFTPRIORITY 25 static INLINE int vavailability_i(int name) { return 3; } static INLINE void vprefetch_v_p(const void *ptr) { } /********************************************** ** Types ***********************************************/ typedef __vector unsigned int vmask; // using __bool with typedef may cause ambiguous errors #define vopmask __vector __bool int //@#define vopmask __vector __bool int typedef __vector signed int vint; typedef __vector signed int vint2; typedef __vector float vfloat; typedef __vector double vdouble; // internal use types typedef __vector unsigned int v__u32; typedef __vector unsigned char v__u8; typedef __vector signed long long v__i64; typedef __vector unsigned long long v__u64; #define v__b64 __vector __bool long long /********************************************** ** Utilities ***********************************************/ #define vset__vi(v0, 
v1) ((vint) {v0, v1, v0, v1}) #define vset__vi2(...) ((vint2) {__VA_ARGS__}) #define vset__vm(...) ((vmask) {__VA_ARGS__}) #define vset__vo(...) ((vopmask) {__VA_ARGS__}) #define vset__vf(...) ((vfloat) {__VA_ARGS__}) #define vset__vd(...) ((vdouble) {__VA_ARGS__}) #define vset__u8(...) ((v__u8) {__VA_ARGS__}) #define vset__u32(...) ((v__u32) {__VA_ARGS__}) #define vset__s64(...) ((v__i64) {__VA_ARGS__}) #define vset__u64(...) ((v__u64) {__VA_ARGS__}) #define vsetall__vi(v) vset__vi(v, v) #define vsetall__vi2(v) vset__vi2(v, v, v, v) #define vsetall__vm(v) vset__vm(v, v, v, v) #define vsetall__vo(v) vset__vo(v, v, v, v) #define vsetall__vf(v) vset__vf(v, v, v, v) #define vsetall__vd(v) vset__vd(v, v) #define vsetall__u8(v) vset__u8(v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v) #define vsetall__u32(v) vset__u32(v, v, v, v) #define vsetall__s64(v) vset__s64(v, v) #define vsetall__u64(v) vset__u64(v, v) #define vzero__vi() vsetall__vi(0) #define vzero__vi2() vsetall__vi2(0) #define vzero__vm() vsetall__vm(0) #define vzero__vo() vsetall__vo(0) #define vzero__vf() vsetall__vf(0) #define vzero__vd() vsetall__vd(0) #define vzero__u8() vsetall__u8(0) #define vzero__u32() vsetall__u32(0) #define vzero__s64() vsetall__s64(0) #define vzero__u64() vsetall__u64(0) //// Swap doubleword elements #ifdef __clang__ static INLINE v__u64 v__swapd_u64(v__u64 v) { return vec_xxpermdi(v, v, 2); } #else static INLINE v__u64 v__swapd_u64(v__u64 v) { __asm__ __volatile__("xxswapd %x0,%x1" : "=wa" (v) : "wa" (v)); return v; } #endif /********************************************** ** Memory ***********************************************/ ////////////// Unaligned memory access ////////////// /** * It's not safe to use vector assignment via (cast & dereference) for unaligned memory access * with almost all clang versions and gcc8 when VSX3 isn't enabled, * these compilers tends to generate instructions 'lvx/stvx' instead of 'lxvd2x/lxvw4x/stxvd2x/stxvw4x' * for more information check 
https://github.com/seiko2plus/vsx_mem_test * * TODO: check GCC(9, 10) */ //// load #if defined(__POWER9_VECTOR__) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8) static vint vloadu_vi_p(const int32_t *ptr) { return *((vint*)ptr); } static INLINE vint2 vloadu_vi2_p(const int32_t *ptr) { return *((vint2*)ptr); } static INLINE vfloat vloadu_vf_p(const float *ptr) { return *((vfloat*)ptr); } static INLINE vdouble vloadu_vd_p(const double *ptr) { return *((vdouble*)ptr); } #else static vint vloadu_vi_p(const int32_t *ptr) { return vec_vsx_ld(0, ptr); } static INLINE vint2 vloadu_vi2_p(const int32_t *ptr) { return vec_vsx_ld(0, ptr); } static INLINE vfloat vloadu_vf_p(const float *ptr) { return vec_vsx_ld(0, ptr); } static INLINE vdouble vloadu_vd_p(const double *ptr) { return vec_vsx_ld(0, ptr); } #endif //// store #if defined(__POWER9_VECTOR__) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8) static void vstoreu_v_p_vi(int32_t *ptr, vint v) { *((vint*)ptr) = v; } static void vstoreu_v_p_vi2(int32_t *ptr, vint2 v) { *((vint2*)ptr) = v; } static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { *((vfloat*)ptr) = v; } static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { *((vdouble*)ptr) = v; } #else static void vstoreu_v_p_vi(int32_t *ptr, vint v) { vec_vsx_st(v, 0, ptr); } static void vstoreu_v_p_vi2(int32_t *ptr, vint2 v) { vec_vsx_st(v, 0, ptr); } static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vec_vsx_st(v, 0, ptr); } static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { vec_vsx_st(v, 0, ptr); } #endif ////////////// aligned memory access ////////////// //// load static INLINE vfloat vload_vf_p(const float *ptr) { return vec_ld(0, ptr); } static INLINE vdouble vload_vd_p(const double *ptr) { return *((vdouble*)ptr); } //// store static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vec_st(v, 0, ptr); } static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *((vdouble*)ptr) = v; } ////////////// non-temporal 
memory access ////////////// //// store static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); } static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { vstore_v_p_vd(ptr, v); } ////////////// LUT ////////////// //// load static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return vset__vd(ptr[vec_extract(vi, 0)], ptr[vec_extract(vi, 1)]); } static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return vset__vf( ptr[vec_extract(vi2, 0)], ptr[vec_extract(vi2, 1)], ptr[vec_extract(vi2, 2)], ptr[vec_extract(vi2, 3)] ); } //// store static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { const v__u64 vll = (v__u64)v; float *ptr_low = ptr + offset*2; float *ptr_high = ptr + (offset + step)*2; *((uint64_t*)ptr_low) = vec_extract(vll, 0); *((uint64_t*)ptr_high) = vec_extract(vll, 1); } static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); } static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); } static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); } /********************************************** ** Misc **********************************************/ // vector with a specific value set to all lanes (Vector Splat) static INLINE vint vcast_vi_i(int i) { return vsetall__vi(i); } static INLINE vint2 vcast_vi2_i(int i) { return vsetall__vi2(i); } static INLINE vfloat vcast_vf_f(float f) { return vsetall__vf(f); } static INLINE vdouble vcast_vd_d(double d) { return vsetall__vd(d); } // cast static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; } static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; } // get the first element static INLINE float vcast_f_vf(vfloat v) { return vec_extract(v, 0); } static INLINE double vcast_d_vd(vdouble v) { 
return vec_extract(v, 0); } static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (vmask)vd; } static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (vdouble)vm; } static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return (vint2)vd; } static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return (vdouble)vi; } static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; } static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; } static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return (vfloat)vi; } static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; } // per element select via mask (blend) static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return vec_sel(y, x, (v__b64)o); } static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return vec_sel(y, x, o); } static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) { return vec_sel(y, x, o); } static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return vec_sel(y, x, o); } static INLINE vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return vsel_vf_vo_vf_vf(o, vsetall__vf(v1), vsetall__vf(v0)); } static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) { return vsel_vf_vo_vf_vf(o0, vsetall__vf(d0), vsel_vf_vo_f_f(o1, d1, d2)); } static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) { return vsel_vf_vo_vf_vf(o0, vsetall__vf(d0), vsel_vf_vo_vf_vf(o1, vsetall__vf(d1), vsel_vf_vo_f_f(o2, d2, d3))); } static INLINE vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return vsel_vd_vo_vd_vd(o, vsetall__vd(v1), vsetall__vd(v0)); } static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) { return vsel_vd_vo_vd_vd(o0, vsetall__vd(d0), vsel_vd_vo_d_d(o1, d1, d2)); } static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, 
double d0, double d1, double d2, double d3) { return vsel_vd_vo_vd_vd(o0, vsetall__vd(d0), vsel_vd_vo_vd_vd(o1, vsetall__vd(d1), vsel_vd_vo_d_d(o2, d2, d3))); } static INLINE int vtestallones_i_vo32(vopmask g) { return vec_all_ne((vint2)g, vzero__vi2()); } static INLINE int vtestallones_i_vo64(vopmask g) { return vec_all_ne((v__i64)g, vzero__s64()); } /********************************************** ** Conversions **********************************************/ ////////////// Numeric ////////////// // pack 64-bit mask to 32-bit static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask)vec_pack((v__u64)m, (v__u64)m); } // clip 64-bit lanes to lower 32-bit static INLINE vint vcastu_vi_vi2(vint2 vi2) { return vec_mergeo(vi2, vec_splat(vi2, 3)); } // expand lower 32-bit mask static INLINE vopmask vcast_vo64_vo32(vopmask m) { return vec_mergeh(m, m); } // unsigned expand lower 32-bit integer static INLINE vint2 vcastu_vi2_vi(vint vi) { return vec_mergeh(vzero__vi(), vi); } // signed int to single-precision static INLINE vfloat vcast_vf_vi2(vint2 vi) { vfloat ret; #ifdef __clang__ ret = __builtin_convertvector(vi, vfloat); #else __asm__ __volatile__("xvcvsxwsp %x0,%x1" : "=wa" (ret) : "wa" (vi)); #endif return ret; } // lower signed int to double-precision static INLINE vdouble vcast_vd_vi(vint vi) { vdouble ret; vint swap = vec_mergeh(vi, vi); #ifdef __clang__ ret = __builtin_vsx_xvcvsxwdp(swap); #else __asm__ __volatile__("xvcvsxwdp %x0,%x1" : "=wa" (ret) : "wa" (swap)); #endif return ret; } // zip two scalars static INLINE vmask vcast_vm_i_i(int l, int h) { return (vmask)vec_mergeh(vsetall__vi2(h), vsetall__vi2(l)); } ////////////// Truncation ////////////// static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { vint2 ret; #ifdef __clang__ ret = __builtin_convertvector(vf, vint2); #else __asm__ __volatile__("xvcvspsxws %x0,%x1" : "=wa" (ret) : "wa" (vf)); #endif return ret; } static INLINE vint vtruncate_vi_vd(vdouble vd) { vint ret; #ifdef __clang__ ret = 
__builtin_vsx_xvcvdpsxws(vd); #else __asm__ __volatile__("xvcvdpsxws %x0,%x1" : "=wa" (ret) : "wa" (vd)); #endif return vec_mergeo(ret, vec_splat(ret, 3)); } static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vec_trunc(vd); } static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return vec_trunc(vf); } ////////////// Rounding ////////////// // towards the nearest even static INLINE vint vrint_vi_vd(vdouble vd) { return vtruncate_vi_vd(vec_rint(vd)); } static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vtruncate_vi2_vf(vec_rint(vf)); } static INLINE vdouble vrint_vd_vd(vdouble vd) { return vec_rint(vd); } static INLINE vfloat vrint_vf_vf(vfloat vf) { return vec_rint(vf); } /********************************************** ** Logical **********************************************/ ////////////// And ////////////// static INLINE vint vand_vi_vi_vi(vint x, vint y) { return vec_and(x, y); } static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vec_and((vint)x, y); } static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_and(x, y); } static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vec_and((vint2)x, y); } static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vec_and(x, y); } static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vec_and((vmask)x, y); } static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vec_and((vmask)x, y); } static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vec_and(x, y); } ////////////// Or ////////////// static INLINE vint vor_vi_vi_vi(vint x, vint y) { return vec_or(x, y); } static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_or(x, y); } static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vec_or(x, y); } static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vec_or((vmask)x, y); } static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vec_or((vmask)x, y); } static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { 
return vec_or(x, y); } ////////////// Xor ////////////// static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return vec_xor(x, y); } static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_xor(x, y); } static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vec_xor(x, y); } static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vec_xor((vmask)x, y); } static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vec_xor((vmask)x, y); } static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vec_xor(x, y); } ////////////// Not ////////////// static INLINE vopmask vnot_vo_vo(vopmask o) { return vec_nor(o, o); } ////////////// And Not ((~x) & y) ////////////// static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return vec_andc(y, x); } static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return vec_andc(y, (vint)x); } static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_andc(y, x); } static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vec_andc(y, x); } static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vec_andc(y, x); } static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vec_andc(y, x); } static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vec_andc(y, x); } static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vec_andc(y, (vint2)x); } /********************************************** ** Comparison **********************************************/ ////////////// Equal ////////////// static INLINE vint veq_vi_vi_vi(vint x, vint y) { return (vint)vec_cmpeq(x, y); } static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return vec_cmpeq(x, y); } static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return vec_cmpeq(x, y); } static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vec_cmpeq(x, y); } static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return (vopmask)vec_cmpeq((v__u64)x, (v__u64)y); } static INLINE 
vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vec_cmpeq(x, y); } static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpeq(x, y); } ////////////// Not Equal ////////////// static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vnot_vo_vo(vec_cmpeq(x, y)); } static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vnot_vo_vo((vopmask)vec_cmpeq(x, y)); } ////////////// Less Than ////////////// static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vec_cmplt(x, y); } static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmplt(x, y); } ////////////// Greater Than ////////////// static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return (vint)vec_cmpgt(x, y); } static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return vec_cmpgt(x, y);} static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vec_cmpgt(x, y); } static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return vec_cmpgt(x, y); } static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vec_cmpgt(x, y); } static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpgt(x, y); } ////////////// Less Than Or Equal ////////////// static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vec_cmple(x, y); } static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmple(x, y); } ////////////// Greater Than Or Equal ////////////// static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vec_cmpge(x, y); } static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpge(x, y); } ////////////// Special Cases ////////////// static INLINE vopmask visinf_vo_vf(vfloat d) { return vec_cmpeq(vec_abs(d), vsetall__vf(SLEEF_INFINITYf)); } static INLINE vopmask visinf_vo_vd(vdouble d) { return (vopmask)vec_cmpeq(vec_abs(d), vsetall__vd(SLEEF_INFINITY)); } static INLINE vopmask vispinf_vo_vf(vfloat d) { return vec_cmpeq(d, vsetall__vf(SLEEF_INFINITYf)); 
} static INLINE vopmask vispinf_vo_vd(vdouble d) { return (vopmask)vec_cmpeq(d, vsetall__vd(SLEEF_INFINITY)); } static INLINE vopmask visminf_vo_vf(vfloat d) { return vec_cmpeq(d, vsetall__vf(-SLEEF_INFINITYf)); } static INLINE vopmask visminf_vo_vd(vdouble d) { return (vopmask)vec_cmpeq(d, vsetall__vd(-SLEEF_INFINITY)); } static INLINE vopmask visnan_vo_vf(vfloat d) { return vnot_vo_vo(vec_cmpeq(d, d)); } static INLINE vopmask visnan_vo_vd(vdouble d) { return vnot_vo_vo((vopmask)vec_cmpeq(d, d)); } /********************************************** ** Shift **********************************************/ ////////////// Left ////////////// static INLINE vint vsll_vi_vi_i(vint x, int c) { return vec_sl (x, vsetall__u32(c)); } static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return vec_sl(x, vsetall__u32(c)); } ////////////// Right ////////////// static INLINE vint vsrl_vi_vi_i(vint x, int c) { return vec_sr(x, vsetall__u32(c)); } static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return vec_sr(x, vsetall__u32(c)); } ////////////// Algebraic Right ////////////// static INLINE vint vsra_vi_vi_i(vint x, int c) { return vec_sra(x, vsetall__u32(c)); } static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return vec_sra(x, vsetall__u32(c)); } /********************************************** ** Reorder **********************************************/ ////////////// Reverse ////////////// // Reverse elements order inside the lower and higher parts static INLINE vint2 vrev21_vi2_vi2(vint2 vi) { return vec_mergee(vec_mergeo(vi, vi), vi); } static INLINE vfloat vrev21_vf_vf(vfloat vf) { return (vfloat)vrev21_vi2_vi2((vint2)vf); } // Swap the lower and higher parts static INLINE vfloat vreva2_vf_vf(vfloat vf) { return (vfloat)v__swapd_u64((v__u64)vf); } static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble)v__swapd_u64((v__u64)vd); } static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; } /********************************************** ** Arithmetic 
**********************************************/ ////////////// Negation ////////////// static INLINE vint vneg_vi_vi(vint e) { #ifdef __clang__ return vec_neg(e); #else return vec_sub(vzero__vi(), e); #endif } static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vneg_vi_vi(e); } static INLINE vfloat vneg_vf_vf(vfloat d) { vfloat ret; #ifdef __clang__ ret = vec_neg(d); #else __asm__ __volatile__("xvnegsp %x0,%x1" : "=wa" (ret) : "wa" (d)); #endif return ret; } static INLINE vdouble vneg_vd_vd(vdouble d) { vdouble ret; #ifdef __clang__ ret = vec_neg(d); #else __asm__ __volatile__("xvnegdp %x0,%x1" : "=wa" (ret) : "wa" (d)); #endif return ret; } static INLINE vfloat vposneg_vf_vf(vfloat d) { return vec_xor(d, vset__vf(+0.0f, -0.0f, +0.0f, -0.0f)); } static INLINE vdouble vposneg_vd_vd(vdouble d) { return vec_xor(d, vset__vd(+0.0, -0.0)); } static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vec_xor(d, vset__vf(-0.0f, +0.0f, -0.0f, +0.0f)); } static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vec_xor(d, vset__vd(-0.0, +0.0)); } ////////////// Addition ////////////// static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return vec_add(x, y); } static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_add(x, y); } static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return vec_add(x, y); } static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return vec_add(x, y); } static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return (vmask)vec_add((v__i64)x, (v__i64)y); } ////////////// Subtraction ////////////// static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return vec_sub(x, y); } static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_sub(x, y); } static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return vec_sub(x, y); } static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return vec_sub(x, y); } static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vec_add(x, vnegpos_vd_vd(y)); } static INLINE vfloat 
vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vec_add(x, vnegpos_vf_vf(y)); } ////////////// Multiplication ////////////// static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return vec_mul(x, y); } static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return vec_mul(x, y); } static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return vec_div(x, y); } static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return vec_div(x, y); } static INLINE vfloat vrec_vf_vf(vfloat x) { return vec_div(vsetall__vf(1.0f), x); } static INLINE vdouble vrec_vd_vd(vdouble x) { return vec_div(vsetall__vd(1.0), x); } /********************************************** ** Math **********************************************/ static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vec_max(x, y); } static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return vec_max(x, y); } static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vec_min(x, y); } static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return vec_min(x, y); } static INLINE vfloat vabs_vf_vf(vfloat f) { return vec_abs(f); } static INLINE vdouble vabs_vd_vd(vdouble d) { return vec_abs(d); } static INLINE vfloat vsqrt_vf_vf(vfloat f) { return vec_sqrt(f); } static INLINE vdouble vsqrt_vd_vd(vdouble d) { return vec_sqrt(d); } /********************************************** ** FMA3 **********************************************/ #if CONFIG == 1 static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_madd(x, y, z); } static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_msub(x, y, z); } static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_msub(x, y, z); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmsub(x, y, z); } static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble 
x, vdouble y, vdouble z) { return vec_nmsub(x, y, z); } #else static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_add(vec_mul(x, y), z); } static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_add(vec_mul(x, y), z); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_sub(vec_mul(x, y), z); } static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_sub(vec_mul(x, y), z); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_sub(z, vec_mul(x, y)); } static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_sub(z, vec_mul(x, y)); } #endif static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_madd(x, y, z); } static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); } static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_madd(x, y, z); } static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); } static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_msub(x, y, z); } static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_msub(x, y, z); } static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmsub(x, y, z); } static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmsub(x, y, z); } static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmadd(x, y, z); } static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmadd(x, y, z); } static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); } static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); 
} ================================================ FILE: src/helpersse2.h ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #if CONFIG == 2 #if !defined(__SSE2__) && !defined(SLEEF_GENHEADER) #error Please specify -msse2. #endif #elif CONFIG == 3 #if (!defined(__SSE2__) || !defined(__SSE3__)) && !defined(SLEEF_GENHEADER) #error Please specify -msse2 and -msse3 #endif #elif CONFIG == 4 #if (!defined(__SSE2__) || !defined(__SSE3__) || !defined(__SSE4_1__)) && !defined(SLEEF_GENHEADER) #error Please specify -msse2, -msse3 and -msse4.1 #endif #else #error CONFIG macro invalid or not defined #endif #define ENABLE_DP //@#define ENABLE_DP #define LOG2VECTLENDP 1 //@#define LOG2VECTLENDP 1 #define VECTLENDP (1 << LOG2VECTLENDP) //@#define VECTLENDP (1 << LOG2VECTLENDP) #define ENABLE_SP //@#define ENABLE_SP #define LOG2VECTLENSP (LOG2VECTLENDP+1) //@#define LOG2VECTLENSP (LOG2VECTLENDP+1) #define VECTLENSP (1 << LOG2VECTLENSP) //@#define VECTLENSP (1 << LOG2VECTLENSP) #define ACCURATE_SQRT //@#define ACCURATE_SQRT #if !defined(SLEEF_GENHEADER) #if defined(_MSC_VER) #include #else #include #endif #include #include "misc.h" #endif // #if !defined(SLEEF_GENHEADER) typedef __m128i vmask; typedef __m128i vopmask; typedef __m128d vdouble; typedef __m128i vint; typedef __m128 vfloat; typedef __m128i vint2; typedef struct { vmask x, y; } vmask2; // #if !defined(SLEEF_GENHEADER) #ifndef __SLEEF_H__ static inline void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) { /* We don't care for cpuid detection */ out[0] = 0xFFFFFFFF; out[1] = 0xFFFFFFFF; out[2] = 0xFFFFFFFF; out[3] = 0xFFFFFFFF; } #endif static INLINE int cpuSupportsSSE2() { int32_t reg[4]; Sleef_x86CpuID(reg, 1, 0); return (reg[3] & (1 << 26)) != 0; } static INLINE int cpuSupportsSSE3() { int32_t reg[4]; 
Sleef_x86CpuID(reg, 1, 0); return (reg[2] & (1 << 0)) != 0; } static INLINE int cpuSupportsSSE4_1() { int32_t reg[4]; Sleef_x86CpuID(reg, 1, 0); return (reg[2] & (1 << 19)) != 0; } #if defined(__SSE2__) && defined(__SSE3__) && defined(__SSE4_1__) static INLINE int vavailability_i(int name) { //int d = __builtin_cpu_supports("sse2") && __builtin_cpu_supports("sse3") && __builtin_cpu_supports("sse4.1"); int d = cpuSupportsSSE2() && cpuSupportsSSE3() && cpuSupportsSSE4_1(); return d ? 3 : 0; } #define ISANAME "SSE4.1" #define DFTPRIORITY 12 #elif defined(__SSE2__) && defined(__SSE3__) static INLINE int vavailability_i(int name) { //int d = __builtin_cpu_supports("sse2") && __builtin_cpu_supports("sse3"); int d = cpuSupportsSSE2() && cpuSupportsSSE3(); return d ? 3 : 0; } #define ISANAME "SSE3" #define DFTPRIORITY 11 #else static INLINE int vavailability_i(int name) { int d = cpuSupportsSSE2(); return d ? 3 : 0; } #define ISANAME "SSE2" #define DFTPRIORITY 10 #endif #endif // #if !defined(SLEEF_GENHEADER) static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); } static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; } static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; } // static vint2 vloadu_vi2_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); } static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *)p, v); } static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); } static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); } // static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm_and_si128(x, y); } static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); } static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm_or_si128(x, y); } static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); } static INLINE vopmask 
vand_vo_vo_vo(vopmask x, vopmask y) { return _mm_and_si128(x, y); } static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return _mm_andnot_si128(x, y); } static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return _mm_or_si128(x, y); } static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return _mm_xor_si128(x, y); } static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return _mm_and_si128(x, y); } static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return _mm_or_si128(x, y); } static INLINE vmask vandnot_vm_vo64_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); } static INLINE vmask vxor_vm_vo64_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); } static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return _mm_and_si128(x, y); } static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return _mm_or_si128(x, y); } static INLINE vmask vandnot_vm_vo32_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); } static INLINE vmask vxor_vm_vo32_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); } static INLINE vopmask vcast_vo32_vo64(vopmask m) { return _mm_shuffle_epi32(m, 0x08); } static INLINE vopmask vcast_vo64_vo32(vopmask m) { return _mm_shuffle_epi32(m, 0x50); } // static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); } static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); } static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); } static INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(0, 0, i, i); } static INLINE vint2 vcastu_vi2_vi(vint vi) { return _mm_and_si128(_mm_shuffle_epi32(vi, 0x73), _mm_set_epi32(-1, 0, -1, 0)); } static INLINE vint vcastu_vi_vi2(vint2 vi) { return _mm_shuffle_epi32(vi, 0x0d); } #if CONFIG == 4 static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } 
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm_cmpeq_epi64(x, y); } #define FULL_FP_ROUNDING //@#define FULL_FP_ROUNDING #else static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); } static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); } static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { vmask t = _mm_cmpeq_epi32(x, y); return vand_vm_vm_vm(t, _mm_shuffle_epi32(t, 0xb1)); } #endif static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm_add_epi64(x, y); } static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm_set_epi32(i0, i1, i0, i1); } // static INLINE vdouble vcast_vd_d(double d) { return _mm_set1_pd(d); } static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm_castpd_si128(vd); } static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return _mm_castpd_si128(vd); } static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return _mm_castsi128_pd(vi); } static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm_castsi128_pd(vm); } static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_add_pd(x, y); } static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm_sub_pd(x, y); } static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm_mul_pd(x, y); } static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm_div_pd(x, y); } static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm_div_pd(_mm_set1_pd(1), x); } static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm_sqrt_pd(x); } static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm_andnot_pd(_mm_set1_pd(-0.0), d); } static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm_xor_pd(_mm_set1_pd(-0.0), d); } static 
INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(z, vmul_vd_vd_vd(x, y)); } static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm_max_pd(x, y); } static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm_min_pd(x, y); } static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpeq_pd(x, y)); } static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpneq_pd(x, y)); } static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmplt_pd(x, y)); } static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmple_pd(x, y)); } static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpgt_pd(x, y)); } static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpge_pd(x, y)); } static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); } static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); } static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); } static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); } static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); } static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); } static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); } static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return _mm_and_si128(x, y); } static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return _mm_andnot_si128(x, y); } static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); 
} // closes a definition that begins before this chunk

/* 32-bit integer shifts and comparisons (SSE2). */
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); } // logical shift right
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); } // arithmetic shift right
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }

#if CONFIG == 4
/* SSE4.1 available: hardware blend.  Note the operand order:
   _mm_blendv_* picks from its SECOND operand where the mask is set,
   hence (y, x, m) selects x where m is true. */
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, m); }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask m, vdouble x, vdouble y) { return _mm_blendv_pd(y, x, _mm_castsi128_pd(m)); }
#else
/* Plain SSE2: emulate select as (m & x) | (~m & y). */
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return vor_vm_vm_vm(vand_vm_vm_vm(m, x), vandnot_vm_vm_vm(m, y)); }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask opmask, vdouble x, vdouble y) {
  return _mm_or_pd(_mm_and_pd(_mm_castsi128_pd(opmask), x),
                   _mm_andnot_pd(_mm_castsi128_pd(opmask), y));
}
#endif

/* Selects between scalar constants, cascading one mask at a time. */
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0)); }

static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}

static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}

/* FP classification (double). */
static INLINE vopmask visinf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm_cmpeq_pd(vabs_vd_vd(d), _mm_set1_pd(SLEEF_INFINITY))); }
static INLINE vopmask vispinf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm_cmpeq_pd(d, _mm_set1_pd(SLEEF_INFINITY))); }
static INLINE vopmask visminf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm_cmpeq_pd(d, _mm_set1_pd(-SLEEF_INFINITY))); }
static INLINE vopmask visnan_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm_cmpneq_pd(d, d)); } // NaN != NaN

//

/* Double-precision loads/stores (aligned and unaligned variants). */
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }

/* Emulated gather: spill the indices to a stack array, then do scalar loads. */
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { int a[sizeof(vint)/sizeof(int)]; vstoreu_v_p_vi(a, vi); return _mm_set_pd(ptr[a[1]], ptr[a[0]]); }

// This function is for debugging
static INLINE double vcast_d_vd(vdouble v) { double a[VECTLENDP]; vstoreu_v_p_vd(a, v); return a[0]; }

//

/* vint2 and vmask are the same 128-bit register type here, so these are identity casts. */
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }

/* float <-> int32 conversions and broadcasts. */
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return _mm_cvtps_epi32(vf); }      // round to nearest
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return _mm_cvttps_epi32(vf); } // truncate toward zero
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { return _mm_set1_epi32(i); }

/* Bit-level reinterpret casts (no value change). */
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm_castps_si128(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm_castsi128_ps(vm); }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return _mm_castsi128_ps(vm); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return _mm_castps_si128(vf); }

#if CONFIG != 4
/* No SSE4.1 round instructions: round by converting through int32.
   NOTE(review): this is only exact while the value fits in int32 —
   callers are presumably expected to keep arguments in range. */
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
static INLINE vfloat vrint_vf_vf(vfloat vf) { return vcast_vf_vi2(vrint_vi2_vf(vf)); }
#endif

/* Single-precision arithmetic. */
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm_sqrt_ps(x); }

/* |f| clears the sign bit; -d flips it. */
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }

/* Unfused multiply-add variants (two roundings). */
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }   // x*y + z
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } // x*y - z
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } // z - x*y

static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm_min_ps(x, y); }

/* Single-precision comparisons, returned as all-ones/all-zeros lane masks. */
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpeq_ps(x, y)); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpneq_ps(x, y)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmplt_ps(x, y)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmple_ps(x, y)); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpgt_ps(x, y)); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpge_ps(x, y)); }

/* vint2 ops forward to the vint ops (same 128-bit register type). */
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vadd_vi_vi_vi(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsub_vi_vi_vi(x, y); }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vand_vi_vi_vi(x, y); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vandnot_vi_vi_vi(x, y); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vor_vi_vi_vi(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return vxor_vi_vi_vi(x, y); }
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi_vo_vi(x, y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi_vo_vi(x, y); }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return vsll_vi_vi_i(x, c); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return vsrl_vi_vi_i(x, c); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return vsra_vi_vi_i(x, c); }
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }

#if CONFIG == 4
/* SSE4.1 blends again; see vsel_vi_vo_vi_vi above for the operand order. */
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return _mm_blendv_epi8(y, x, m); }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask m, vfloat x, vfloat y) { return _mm_blendv_ps(y, x, _mm_castsi128_ps(m)); }
#else
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return vor_vi2_vi2_vi2(vand_vi2_vi2_vi2(m, x), vandnot_vi2_vi2_vi2(m, y)); }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask opmask, vfloat x, vfloat y) {
  return _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(opmask), x),
                   _mm_andnot_ps(_mm_castsi128_ps(opmask), y));
}
#endif

/* Selects between scalar float constants, cascading one mask at a time. */
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0)); }

static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}

static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}

/* FP classification (float). */
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); } // NaN != NaN

/* Single-precision loads/stores. */
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }

/* Emulated gather, as with the double version above. */
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi) { int a[VECTLENSP]; vstoreu_v_p_vi2(a, vi); return _mm_set_ps(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]); }

// This function is for debugging
static INLINE float vcast_f_vf(vfloat v) { float a[VECTLENSP]; vstoreu_v_p_vf(a, v); return a[0]; }

//

/* Alternating sign-bit patterns used by vposneg/vnegpos (GCC vector literals). */
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })

/* XOR the alternating sign patterns in: posneg negates odd lanes, negpos even lanes. */
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }

#if CONFIG >= 3
/* SSE3 available: hardware addsub (subtract in even lanes, add in odd lanes). */
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_addsub_pd(x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_addsub_ps(x, y); }
#else
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
#endif

static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }

/* Lane permutations. */
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm_shuffle_pd(d0, d0, 1); } // swap the two lanes
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }                        // identity for a 2-lane vector

static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm_stream_pd(ptr, v); } // non-temporal store

/* Scatter of one 2-lane double vector; step is unused with a single vector. */
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd((double *)(&ptr[2*offset]), v); } // streaming variant

//

static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); } // swap within each pair
static INLINE vfloat vreva2_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); } // swap the two pairs
static INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); }

static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm_stream_ps(ptr, v); }

/* Scatter float pairs: low pair via storel, high pair via storeh. */
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
}

static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
}

//

/* Quad-precision support: (un)interleave the two 64-bit halves of a vmask2. */
static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) { return (vmask2) { _mm_unpacklo_epi64(v.x, v.y), _mm_unpackhi_epi64(v.x, v.y) }; }
static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) { return (vmask2) { _mm_unpacklo_epi64(v.x, v.y), _mm_unpackhi_epi64(v.x, v.y) }; }
static INLINE vint vuninterleave_vi_vi(vint v) { return v; }
static INLINE vdouble vinterleave_vd_vd(vdouble vd) { return vd; }
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) { return vd; }
static INLINE vmask vinterleave_vm_vm(vmask vm) { return vm; }
static INLINE vmask vuninterleave_vm_vm(vmask vm) { return vm; }

static vmask2 vloadu_vm2_p(void *p) { vmask2 vm2; memcpy(&vm2, p, VECTLENDP * 16); return vm2; }

#if !defined(SLEEF_GENHEADER)
typedef Sleef_quad2 vargquad;

/* Convert between the public quad argument type and the internal vmask2 layout. */
static INLINE vmask2 vcast_vm2_aq(vargquad aq) { return vinterleave_vm2_vm2(vloadu_vm2_p(&aq)); }

static INLINE vargquad vcast_aq_vm2(vmask2 vm2) {
  vm2 = vuninterleave_vm2_vm2(vm2);
  vargquad aq;
  memcpy(&aq, &vm2, VECTLENDP * 16);
  return aq;
}
#endif // #if !defined(SLEEF_GENHEADER)

static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0; }

static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return vor_vm_vm_vm(vand_vm_vm_vm(o, x), vandnot_vm_vm_vm(o, y)); }

/* 64-bit integer arithmetic on the mask type. */
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm_sub_epi64(x, y); }
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm_sub_epi64(vcast_vm_i_i(0, 0), x); }

/* The //@ copies below are directives consumed by SLEEF's header
   generator (SLEEF_GENHEADER); keep them in sync with the #define lines. */
#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)
//@#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
//@#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)

static INLINE vopmask
vgt64_vo_vm_vm(vmask x, vmask y) { int64_t ax[2], ay[2]; _mm_storeu_si128((__m128i *)ax, x); _mm_storeu_si128((__m128i *)ay, y); return _mm_set_epi64x(ax[1] > ay[1] ? -1 : 0, ax[0] > ay[0] ? -1 : 0); } static INLINE vmask vcast_vm_vi(vint vi) { vmask m = _mm_and_si128(_mm_shuffle_epi32(vi, (0 << 6) | (1 << 4) | (0 << 2) | (0 << 0)), _mm_set_epi32(0, -1, 0, -1)); return vor_vm_vm_vm(vcastu_vi2_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi)), m); } static INLINE vint vcast_vi_vm(vmask vm) { return _mm_shuffle_epi32(vm, 0x08); } ================================================ FILE: src/helpersve.h ================================================ /*********************************************************************/ /* Copyright ARM Ltd. 2010 - 2019. */ /* Distributed under the Boost Software License, Version 1.0. */ /* (See accompanying file LICENSE.txt or copy at */ /* http://www.boost.org/LICENSE_1_0.txt) */ /*********************************************************************/ #if !defined(__ARM_FEATURE_SVE) && !defined(SLEEF_GENHEADER) #error Please specify SVE flags. 
#endif

#if !defined(SLEEF_GENHEADER)
/* NOTE(review): the two header names below are missing — this looks like
   an extraction artifact (angle-bracketed include names stripped);
   restore the original system headers before building. */
#include
#include
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)

#if defined(VECTLENDP) || defined(VECTLENSP)
#error VECTLENDP or VECTLENSP already defined
#endif

/* CONFIG selects the SVE flavor: 1/2 are vector-length agnostic
   (2 disables FMA below); 8-11 are fixed-width builds for the DFT.
   The //@ copies are directives for SLEEF's header generator. */
#if CONFIG == 1 || CONFIG == 2
// Vector length agnostic
#define VECTLENSP (svcntw())
//@#define VECTLENSP (svcntw())
#define VECTLENDP (svcntd())
//@#define VECTLENDP (svcntd())
#define ISANAME "AArch64 SVE"
#define ptrue svptrue_b8()
//@#define ptrue svptrue_b8()
#elif CONFIG == 8
// 256-bit vector length
#define ISANAME "AArch64 SVE 256-bit"
#define LOG2VECTLENDP 2
#define ptrue svptrue_pat_b8(SV_VL32)
#define DFTPRIORITY 20
#elif CONFIG == 9
// 512-bit vector length
#define ISANAME "AArch64 SVE 512-bit"
#define LOG2VECTLENDP 3
#define ptrue svptrue_pat_b8(SV_VL64)
#define DFTPRIORITY 21
#elif CONFIG == 10
// 1024-bit vector length
#define ISANAME "AArch64 SVE 1024-bit"
#define LOG2VECTLENDP 4
#define ptrue svptrue_pat_b8(SV_VL128)
#define DFTPRIORITY 22
#elif CONFIG == 11
// 2048-bit vector length
#define ISANAME "AArch64 SVE 2048-bit"
#define LOG2VECTLENDP 5
#define ptrue svptrue_pat_b8(SV_VL256)
#define DFTPRIORITY 23
#else
#error CONFIG macro invalid or not defined
#endif

#ifdef LOG2VECTLENDP
// For DFT, VECTLENDP and VECTLENSP are not the size of the available
// vector length, but the size of the partial vectors utilized in the
// computation. The appropriate VECTLENDP and VECTLENSP are chosen by
// the dispatcher according to the value of svcntd().
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENDP (1 << LOG2VECTLENDP)
#define VECTLENSP (1 << LOG2VECTLENSP)

/* Fixed-width build is usable (returns 3) only when the hardware
   vector is at least as wide as the compiled-in width. */
static INLINE int vavailability_i(int name) { return svcntd() >= VECTLENDP ? 3 : 0; }
#else
static INLINE int vavailability_i(int name) { return 3; }
#endif

#define ENABLE_SP
//@#define ENABLE_SP
#define ENABLE_DP
//@#define ENABLE_DP

#if CONFIG != 2
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
//#define SPLIT_KERNEL // Benchmark comparison is needed to determine whether this option should be enabled.
#endif

#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT

// Type definitions

// Mask definition
typedef svint32_t vmask;
typedef svbool_t vopmask;

// Single precision definitions
typedef svfloat32_t vfloat;
typedef svint32_t vint2;

// Double precision definitions
typedef svfloat64_t vdouble;
typedef svint32_t vint;

// Double-double data type with setter/getter functions
typedef svfloat64x2_t vdouble2;
static INLINE vdouble vd2getx_vd_vd2(vdouble2 v) { return svget2_f64(v, 0); }
static INLINE vdouble vd2gety_vd_vd2(vdouble2 v) { return svget2_f64(v, 1); }
static INLINE vdouble2 vd2setxy_vd2_vd_vd(vdouble x, vdouble y) { return svcreate2_f64(x, y); }
static INLINE vdouble2 vd2setx_vd2_vd2_vd(vdouble2 v, vdouble d) { return svset2_f64(v, 0, d); }
static INLINE vdouble2 vd2sety_vd2_vd2_vd(vdouble2 v, vdouble d) { return svset2_f64(v, 1, d); }

// Double-float data type with setter/getter functions
typedef svfloat32x2_t vfloat2;
static INLINE vfloat vf2getx_vf_vf2(vfloat2 v) { return svget2_f32(v, 0); }
static INLINE vfloat vf2gety_vf_vf2(vfloat2 v) { return svget2_f32(v, 1); }
static INLINE vfloat2 vf2setxy_vf2_vf_vf(vfloat x, vfloat y) { return svcreate2_f32(x, y); }
static INLINE vfloat2 vf2setx_vf2_vf2_vf(vfloat2 v, vfloat d) { return svset2_f32(v, 0, d); }
static INLINE vfloat2 vf2sety_vf2_vf2_vf(vfloat2 v, vfloat d) { return svset2_f32(v, 1, d); }

// vmask2 is mainly used for quad-precision functions
typedef svint32x2_t vmask2;
static INLINE vmask vm2getx_vm_vm2(vmask2 v) { return svget2_s32(v, 0); }
static INLINE vmask vm2gety_vm_vm2(vmask2 v) { return svget2_s32(v, 1); }
static INLINE vmask2 vm2setxy_vm2_vm_vm(vmask x, vmask y) { return svcreate2_s32(x, y); }
static INLINE vmask2 vm2setx_vm2_vm2_vm(vmask2 v, vmask x) { return svset2_s32(v, 0, x); }
static INLINE vmask2 vm2sety_vm2_vm2_vm(vmask2 v, vmask y) { return svset2_s32(v, 1, y); }

// Auxiliary data types

/* di_t: a double vector paired with an int vector; the int vector is
   stored bit-cast into the second f64 slot of the tuple. */
typedef svfloat64x2_t di_t;
static INLINE vdouble digetd_vd_di(di_t d) { return svget2_f64(d, 0); }
static INLINE vint digeti_vi_di(di_t d) { return svreinterpret_s32_f64(svget2_f64(d, 1)); }
static INLINE di_t disetdi_di_vd_vi(vdouble d, vint i) { return svcreate2_f64(d, svreinterpret_f64_s32(i)); }

//

/* fi_t: float vector + int vector, same bit-cast layout. */
typedef svfloat32x2_t fi_t;
static INLINE vfloat figetd_vf_di(fi_t d) { return svget2_f32(d, 0); }
static INLINE vint2 figeti_vi2_di(fi_t d) { return svreinterpret_s32_f32(svget2_f32(d, 1)); }
static INLINE fi_t fisetdi_fi_vf_vi2(vfloat d, vint2 i) { return svcreate2_f32(d, svreinterpret_f32_s32(i)); }

//

/* ddi_t: double-double value + int vector (int bit-cast into slot 2). */
typedef svfloat64x3_t ddi_t;
static INLINE vdouble2 ddigetdd_vd2_ddi(ddi_t d) { return svcreate2_f64(svget3_f64(d, 0), svget3_f64(d, 1)); }
static INLINE vint ddigeti_vi_ddi(ddi_t d) { return svreinterpret_s32_f64(svget3_f64(d, 2)); }
static INLINE ddi_t ddisetddi_ddi_vd2_vi(vdouble2 v, vint i) { return svcreate3_f64(svget2_f64(v, 0), svget2_f64(v, 1), svreinterpret_f64_s32(i)); }
static INLINE ddi_t ddisetdd_ddi_ddi_vd2(ddi_t ddi, vdouble2 v) { return svcreate3_f64(svget2_f64(v, 0), svget2_f64(v, 1), svget3_f64(ddi, 2)); }

//

/* dfi_t: double-float value + int vector (int bit-cast into slot 2). */
typedef svfloat32x3_t dfi_t;
static INLINE vfloat2 dfigetdf_vf2_dfi(dfi_t d) { return svcreate2_f32(svget3_f32(d, 0), svget3_f32(d, 1)); }
static INLINE vint2 dfigeti_vi2_dfi(dfi_t d) { return svreinterpret_s32_f32(svget3_f32(d, 2)); }
static INLINE dfi_t dfisetdfi_dfi_vf2_vi2(vfloat2 v, vint2 i) { return svcreate3_f32(svget2_f32(v, 0), svget2_f32(v, 1), svreinterpret_f32_s32(i)); }
static INLINE dfi_t dfisetdf_dfi_dfi_vf2(dfi_t dfi, vfloat2 v) { return svcreate3_f32(svget2_f32(v, 0), svget2_f32(v, 1), svget3_f32(dfi, 2)); }
// typedef svfloat64x4_t dd2; static INLINE dd2 dd2setab_dd2_vd2_vd2(vdouble2 a, vdouble2 b) { return svcreate4_f64(svget2_f64(a, 0), svget2_f64(a, 1), svget2_f64(b, 0), svget2_f64(b, 1)); } static INLINE vdouble2 dd2geta_vd2_dd2(dd2 d) { return svcreate2_f64(svget4_f64(d, 0), svget4_f64(d, 1)); } static INLINE vdouble2 dd2getb_vd2_dd2(dd2 d) { return svcreate2_f64(svget4_f64(d, 2), svget4_f64(d, 3)); } // typedef svfloat32x4_t df2; static INLINE df2 df2setab_df2_vf2_vf2(vfloat2 a, vfloat2 b) { return svcreate4_f32(svget2_f32(a, 0), svget2_f32(a, 1), svget2_f32(b, 0), svget2_f32(b, 1)); } static INLINE vfloat2 df2geta_vf2_df2(df2 d) { return svcreate2_f32(svget4_f32(d, 0), svget4_f32(d, 1)); } static INLINE vfloat2 df2getb_vf2_df2(df2 d) { return svcreate2_f32(svget4_f32(d, 2), svget4_f32(d, 3)); } // typedef svfloat64x3_t vdouble3; static INLINE vdouble vd3getx_vd_vd3(vdouble3 v) { return svget3_f64(v, 0); } static INLINE vdouble vd3gety_vd_vd3(vdouble3 v) { return svget3_f64(v, 1); } static INLINE vdouble vd3getz_vd_vd3(vdouble3 v) { return svget3_f64(v, 2); } static INLINE vdouble3 vd3setxyz_vd3_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return svcreate3_f64(x, y, z); } static INLINE vdouble3 vd3setx_vd3_vd3_vd(vdouble3 v, vdouble d) { return svset3_f64(v, 0, d); } static INLINE vdouble3 vd3sety_vd3_vd3_vd(vdouble3 v, vdouble d) { return svset3_f64(v, 1, d); } static INLINE vdouble3 vd3setz_vd3_vd3_vd(vdouble3 v, vdouble d) { return svset3_f64(v, 2, d); } // typedef svfloat64x4_t tdx; static INLINE vmask tdxgete_vm_tdx(tdx t) { return svreinterpret_s32_f64(svget4_f64(t, 0)); } static INLINE vdouble3 tdxgetd3_vd3_tdx(tdx t) { return svcreate3_f64(svget4_f64(t, 1), svget4_f64(t, 2), svget4_f64(t, 3)); } static INLINE vdouble tdxgetd3x_vd_tdx(tdx t) { return svget4_f64(t, 1); } static INLINE vdouble tdxgetd3y_vd_tdx(tdx t) { return svget4_f64(t, 2); } static INLINE vdouble tdxgetd3z_vd_tdx(tdx t) { return svget4_f64(t, 3); } static INLINE tdx 
tdxsete_tdx_tdx_vm(tdx t, vmask e) { return svset4_f64(t, 0, svreinterpret_f64_s32(e)); } static INLINE tdx tdxsetd3_tdx_tdx_vd3(tdx t, vdouble3 d3) { return svcreate4_f64(svget4_f64(t, 0), svget3_f64(d3, 0), svget3_f64(d3, 1), svget3_f64(d3, 2)); } static INLINE tdx tdxsetx_tdx_tdx_vd(tdx t, vdouble x) { return svset4_f64(t, 1, x); } static INLINE tdx tdxsety_tdx_tdx_vd(tdx t, vdouble y) { return svset4_f64(t, 2, y); } static INLINE tdx tdxsetz_tdx_tdx_vd(tdx t, vdouble z) { return svset4_f64(t, 3, z); } static INLINE tdx tdxsetxyz_tdx_tdx_vd_vd_vd(tdx t, vdouble x, vdouble y, vdouble z) { return svcreate4_f64(svget4_f64(t, 0), x, y, z); } static INLINE tdx tdxseted3_tdx_vm_vd3(vmask e, vdouble3 d3) { return svcreate4_f64(svreinterpret_f64_s32(e), svget3_f64(d3, 0), svget3_f64(d3, 1), svget3_f64(d3, 2)); } static INLINE tdx tdxsetexyz_tdx_vm_vd_vd_vd(vmask e, vdouble x, vdouble y, vdouble z) { return svcreate4_f64(svreinterpret_f64_s32(e), x, y, z); } // typedef svfloat64x4_t tdi_t; static INLINE vdouble3 tdigettd_vd3_tdi(tdi_t d) { return svcreate3_f64(svget4_f64(d, 0), svget4_f64(d, 1), svget4_f64(d, 2)); } static INLINE vdouble tdigetx_vd_tdi(tdi_t d) { return svget4_f64(d, 0); } static INLINE vint tdigeti_vi_tdi(tdi_t d) { return svreinterpret_s32_f64(svget4_f64(d, 3)); } static INLINE tdi_t tdisettdi_tdi_vd3_vi(vdouble3 v, vint i) { return svcreate4_f64(svget3_f64(v, 0), svget3_f64(v, 1), svget3_f64(v, 2), svreinterpret_f64_s32(i)); } static INLINE tdi_t tdisettd_tdi_tdi_vd3(tdi_t tdi, vdouble3 v) { return svcreate4_f64(svget3_f64(v, 0), svget3_f64(v, 1), svget3_f64(v, 2), svget4_f64(tdi, 3)); } // // masking predicates #define ALL_TRUE_MASK svdup_n_s32(0xffffffff) #define ALL_FALSE_MASK svdup_n_s32(0x0) //@#define ALL_TRUE_MASK svdup_n_s32(0xffffffff) //@#define ALL_FALSE_MASK svdup_n_s32(0x0) static INLINE void vprefetch_v_p(const void *ptr) {} // // // // Test if all lanes are active // // // static INLINE int vtestallones_i_vo32(vopmask g) { svbool_t pg = 
svptrue_b32(); return (svcntp_b32(pg, g) == svcntw()); } static INLINE int vtestallones_i_vo64(vopmask g) { svbool_t pg = svptrue_b64(); return (svcntp_b64(pg, g) == svcntd()); } // // // // // // // Vector load / store static INLINE void vstoreu_v_p_vi2(int32_t *p, vint2 v) { svst1_s32(ptrue, p, v); } static INLINE vfloat vload_vf_p(const float *ptr) { return svld1_f32(ptrue, ptr); } static INLINE vfloat vloadu_vf_p(const float *ptr) { return svld1_f32(ptrue, ptr); } static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { svst1_f32(ptrue, ptr, v); } // Basic logical operations for mask static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return svand_s32_x(ptrue, x, y); } static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return svbic_s32_x(ptrue, y, x); } static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return svorr_s32_x(ptrue, x, y); } static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return sveor_s32_x(ptrue, x, y); } static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return svreinterpret_s32_s64( svadd_s64_x(ptrue, svreinterpret_s64_s32(x), svreinterpret_s64_s32(y))); } // Mask <--> single precision reinterpret static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return svreinterpret_s32_f32(vf); } static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return svreinterpret_f32_s32(vm); } static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return svreinterpret_f32_s32(vm); } static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return svreinterpret_s32_f32(vf); } static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; } static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; } // Conditional select static INLINE vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) { return svsel_s32(svcmpeq_s32(ptrue, m, ALL_TRUE_MASK), x, y); } /****************************************/ /* Single precision FP operations */ /****************************************/ // Broadcast static INLINE vfloat vcast_vf_f(float f) { return svdup_n_f32(f); } // Add, Sub, 
Mul static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return svadd_f32_x(ptrue, x, y); } static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return svsub_f32_x(ptrue, x, y); } static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return svmul_f32_x(ptrue, x, y); } // |x|, -x static INLINE vfloat vabs_vf_vf(vfloat f) { return svabs_f32_x(ptrue, f); } static INLINE vfloat vneg_vf_vf(vfloat f) { return svneg_f32_x(ptrue, f); } // max, min static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return svmax_f32_x(ptrue, x, y); } static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return svmin_f32_x(ptrue, x, y); } // int <--> float conversions static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return svcvt_s32_f32_x(ptrue, vf); } static INLINE vfloat vcast_vf_vi2(vint2 vi) { return svcvt_f32_s32_x(ptrue, vi); } static INLINE vint2 vcast_vi2_i(int i) { return svdup_n_s32(i); } static INLINE vint2 vrint_vi2_vf(vfloat d) { return svcvt_s32_f32_x(ptrue, svrintn_f32_x(ptrue, d)); } #if CONFIG == 1 // Multiply accumulate: z = z + x * y static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return svmad_f32_x(ptrue, x, y, z); } // Multiply subtract: z = z - x * y static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return svmsb_f32_x(ptrue, x, y, z); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return svnmsb_f32_x(ptrue, x, y, z); } #else static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } #endif // fused multiply add / sub static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z + x * y return svmad_f32_x(ptrue, x, y, z); } static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, 
vfloat y, vfloat z) { // z - x * y return svmsb_f32_x(ptrue, x, y, z); } static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // x * y - z return svnmsb_f32_x(ptrue, x, y, z); } // conditional select static INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) { return svsel_f32(mask, x, y); } // Reciprocal 1/x, Division, Square root static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) { #ifndef ENABLE_ALTDIV return svdiv_f32_x(ptrue, n, d); #else // Finite numbers (including denormal) only, gives mostly correctly rounded result vfloat t, u, x, y; svuint32_t i0, i1; i0 = svand_u32_x(ptrue, svreinterpret_u32_f32(n), svdup_n_u32(0x7c000000)); i1 = svand_u32_x(ptrue, svreinterpret_u32_f32(d), svdup_n_u32(0x7c000000)); i0 = svsub_u32_x(ptrue, svdup_n_u32(0x7d000000), svlsr_n_u32_x(ptrue, svadd_u32_x(ptrue, i0, i1), 1)); t = svreinterpret_f32_u32(i0); y = svmul_f32_x(ptrue, d, t); x = svmul_f32_x(ptrue, n, t); t = svrecpe_f32(y); t = svmul_f32_x(ptrue, t, svrecps_f32(y, t)); t = svmul_f32_x(ptrue, t, svrecps_f32(y, t)); u = svmul_f32_x(ptrue, x, t); u = svmad_f32_x(ptrue, svmsb_f32_x(ptrue, y, u, x), t, u); return u; #endif } static INLINE vfloat vrec_vf_vf(vfloat d) { #ifndef ENABLE_ALTDIV return svdivr_n_f32_x(ptrue, d, 1.0f); #else return vsel_vf_vo_vf_vf(svcmpeq_f32(ptrue, vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)), vcast_vf_f(0), vdiv_vf_vf_vf(vcast_vf_f(1.0f), d)); #endif } static INLINE vfloat vsqrt_vf_vf(vfloat d) { #ifndef ENABLE_ALTSQRT return svsqrt_f32_x(ptrue, d); #else // Gives correctly rounded result for all input range vfloat w, x, y, z; y = svrsqrte_f32(d); x = vmul_vf_vf_vf(d, y); w = vmul_vf_vf_vf(vcast_vf_f(0.5), y); y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5)); x = vfma_vf_vf_vf_vf(x, y, x); w = vfma_vf_vf_vf_vf(w, y, w); y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(1.5)); w = vadd_vf_vf_vf(w, w); w = vmul_vf_vf_vf(w, y); x = vmul_vf_vf_vf(w, d); y = vfmapn_vf_vf_vf_vf(w, d, x); z = vfmanp_vf_vf_vf_vf(w, x, 
vcast_vf_f(1)); z = vfmanp_vf_vf_vf_vf(w, y, z); w = vmul_vf_vf_vf(vcast_vf_f(0.5), x); w = vfma_vf_vf_vf_vf(w, z, y); w = vadd_vf_vf_vf(w, x); return svsel_f32(svorr_b_z(ptrue, svcmpeq_f32(ptrue, d, vcast_vf_f(0)), svcmpeq_f32(ptrue, d, vcast_vf_f(SLEEF_INFINITYf))), d, w); #endif } // // // // // // static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0)); } static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2)); } static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3))); } // // // // // // // truncate static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return svrintz_f32_x(ptrue, vd); } // // // // Round float // // // static INLINE vfloat vrint_vf_vf(vfloat vf) { return svrintn_f32_x(svptrue_b32(), vf); } // // // // // // /***************************************/ /* Single precision integer operations */ /***************************************/ // Add, Sub, Neg (-x) static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return svadd_s32_x(ptrue, x, y); } static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return svsub_s32_x(ptrue, x, y); } static INLINE vint2 vneg_vi2_vi2(vint2 e) { return svneg_s32_x(ptrue, e); } // Logical operations static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return svand_s32_x(ptrue, x, y); } static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return svbic_s32_x(ptrue, y, x); } static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return svorr_s32_x(ptrue, x, y); } static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return sveor_s32_x(ptrue, x, y); } // Shifts #define vsll_vi2_vi2_i(x, c) svlsl_n_s32_x(ptrue, x, c) //@#define vsll_vi2_vi2_i(x, 
c) svlsl_n_s32_x(ptrue, x, c) #define vsrl_vi2_vi2_i(x, c) \ svreinterpret_s32_u32(svlsr_n_u32_x(ptrue, svreinterpret_u32_s32(x), c)) //@#define vsrl_vi2_vi2_i(x, c) svreinterpret_s32_u32(svlsr_n_u32_x(ptrue, svreinterpret_u32_s32(x), c)) #define vsra_vi2_vi2_i(x, c) svasr_n_s32_x(ptrue, x, c) //@#define vsra_vi2_vi2_i(x, c) svasr_n_s32_x(ptrue, x, c) // Comparison returning integers static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return svsel_s32(svcmpgt_s32(ptrue, x, y), ALL_TRUE_MASK, ALL_FALSE_MASK); } // conditional select static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return svsel_s32(m, x, y); } /****************************************/ /* opmask operations */ /****************************************/ // single precision FP static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return svcmpeq_f32(ptrue, x, y); } static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return svcmpne_f32(ptrue, x, y); } static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return svcmplt_f32(ptrue, x, y); } static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return svcmple_f32(ptrue, x, y); } static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return svcmpgt_f32(ptrue, x, y); } static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return svcmpge_f32(ptrue, x, y); } static INLINE vopmask visinf_vo_vf(vfloat d) { return svcmpeq_n_f32(ptrue, vabs_vf_vf(d), SLEEF_INFINITYf); } static INLINE vopmask vispinf_vo_vf(vfloat d) { return svcmpeq_n_f32(ptrue, d, SLEEF_INFINITYf); } static INLINE vopmask visminf_vo_vf(vfloat d) { return svcmpeq_n_f32(ptrue, d, -SLEEF_INFINITYf); } static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); } // integers static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return svcmpeq_s32(ptrue, x, y); } static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return svcmpgt_s32(ptrue, x, y); } // logical opmask static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return 
svand_b_z(ptrue, x, y); } static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return svbic_b_z(ptrue, y, x); } static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return svorr_b_z(ptrue, x, y); } static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return sveor_b_z(ptrue, x, y); } static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { // This needs to be zeroing to prevent asinf and atanf denormal test // failing. return svand_s32_z(x, y, y); } // bitmask logical operations static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return svsel_s32(x, y, ALL_FALSE_MASK); } static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return svsel_s32(x, ALL_FALSE_MASK, y); } static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return svsel_s32(x, ALL_TRUE_MASK, y); } // broadcast bitmask static INLINE vmask vcast_vm_i_i(int i0, int i1) { return svreinterpret_s32_u64( svdup_n_u64((0xffffffff & (uint64_t)i1) | (((uint64_t)i0) << 32))); } /*********************************/ /* SVE for double precision math */ /*********************************/ // Vector load/store static INLINE vdouble vload_vd_p(const double *ptr) { return svld1_f64(ptrue, ptr); } static INLINE vdouble vloadu_vd_p(const double *ptr) { return svld1_f64(ptrue, ptr); } static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { svst1_f64(ptrue, ptr, v); } static INLINE void vstoreu_v_p_vi(int *ptr, vint v) { svst1w_s64(ptrue, ptr, svreinterpret_s64_s32(v)); } static vint vloadu_vi_p(int32_t *p) { return svreinterpret_s32_s64(svld1uw_s64(ptrue, (uint32_t *)p)); } // Reinterpret static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return svreinterpret_f64_s32(vm); } static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return svreinterpret_s32_f64(vd); } static INLINE vdouble vreinterpret_vd_vi2(vint2 x) { return svreinterpret_f64_s32(x); } static INLINE vint2 vreinterpret_vi2_vd(vdouble x) { return svreinterpret_s32_f64(x); } static INLINE vint2 vcastu_vi2_vi(vint x) { 
// Completion of vcastu_vi2_vi: shift 32-bit payload into the upper half
// of each 64-bit lane.
return svreinterpret_s32_s64(
    svlsl_n_s64_x(ptrue, svreinterpret_s64_s32(x), 32)); }

// Inverse of vcastu_vi2_vi: extract the upper 32 bits of each 64-bit lane.
static INLINE vint vcastu_vi_vi2(vint2 x) {
  return svreinterpret_s32_u64(
      svlsr_n_u64_x(ptrue, svreinterpret_u64_s32(x), 32));
}

static INLINE vdouble vcast_vd_vi(vint vi) {
  return svcvt_f64_s32_x(ptrue, vi);
}

// Splat
static INLINE vdouble vcast_vd_d(double d) { return svdup_n_f64(d); }

// Conditional select
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) {
  return svsel_f64(o, x, y);
}

static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}

// Two- and three-predicate cascaded selects over scalar constants.
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0,
                                          double d1, double d2) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}

static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1,
                                               vopmask o2, double d0,
                                               double d1, double d2,
                                               double d3) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0),
                          vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1),
                                           vsel_vd_vo_d_d(o2, d2, d3)));
}

static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) {
  return svsel_s32(o, x, y);
}

// truncate
// svrintz = round toward zero, svrintn = round to nearest (ties to even).
static INLINE vdouble vtruncate_vd_vd(vdouble vd) {
  return svrintz_f64_x(ptrue, vd);
}
static INLINE vint vtruncate_vi_vd(vdouble vd) {
  return svcvt_s32_f64_x(ptrue, vd);
}
static INLINE vint vrint_vi_vd(vdouble vd) {
  return svcvt_s32_f64_x(ptrue, svrintn_f64_x(ptrue, vd));
}
static INLINE vdouble vrint_vd_vd(vdouble vd) {
  return svrintn_f64_x(ptrue, vd);
}

// FP math operations
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) {
  return svadd_f64_x(ptrue, x, y);
}
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) {
  return svsub_f64_x(ptrue, x, y);
}
static INLINE vdouble vneg_vd_vd(vdouble x) { return svneg_f64_x(ptrue, x); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) {
  return svmul_f64_x(ptrue, x, y);
}
static INLINE vdouble vabs_vd_vd(vdouble x) { return svabs_f64_x(ptrue, x); }
// (declaration continues on the next line of the excerpt)
static
// Completion of the declaration begun on the previous line.
INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) {
  return svmax_f64_x(ptrue, x, y);
}
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) {
  return svmin_f64_x(ptrue, x, y);
}

#if CONFIG == 1
// Multiply accumulate / subtract
// With CONFIG == 1 the fused SVE instructions are used directly.
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  // z = x*y + z
  return svmad_f64_x(ptrue, x, y, z);
}

static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  // z = x * y - z
  return svnmsb_f64_x(ptrue, x, y, z);
}

static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  return svmsb_f64_x(ptrue, x, y, z);
}
#else
// Non-fused fallback: separate multiply then add/subtract.
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z);
}
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z);
}
#endif

// True fused operations, always available regardless of CONFIG.
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  // z + x * y
  return svmad_f64_x(ptrue, x, y, z);
}

static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  // z - x * y
  return svmsb_f64_x(ptrue, x, y, z);
}

static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  // x * y - z
  return svnmsb_f64_x(ptrue, x, y, z);
}

// Reciprocal 1/x, Division, Square root
static INLINE vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {
#ifndef ENABLE_ALTDIV
  return svdiv_f64_x(ptrue, n, d);
#else
  // Finite numbers (including denormal) only, gives mostly correctly rounded result
  vdouble t, u, x, y;
  svuint64_t i0, i1;
  // Pre-scale numerator and denominator by a shared power-of-two-ish factor
  // derived from their exponent fields to avoid overflow/underflow, then
  // refine a reciprocal estimate (svrecpe) with svrecps steps.
  i0 = svand_u64_x(ptrue, svreinterpret_u64_f64(n),
                   svdup_n_u64(0x7fc0000000000000L));
  i1 = svand_u64_x(ptrue, svreinterpret_u64_f64(d),
                   svdup_n_u64(0x7fc0000000000000L));
  i0 = svsub_u64_x(ptrue, svdup_n_u64(0x7fd0000000000000L),
                   svlsr_n_u64_x(ptrue, svadd_u64_x(ptrue, i0, i1), 1));
  t = svreinterpret_f64_u64(i0);
  y = svmul_f64_x(ptrue, d, t);
  x = svmul_f64_x(ptrue, n, t);
  t = svrecpe_f64(y);
  t = svmul_f64_x(ptrue, t, svrecps_f64(y, t));
  // (refinement continues on the next line of the excerpt)
  t = svmul_f64_x(ptrue, t,
// Completion of the ENABLE_ALTDIV path begun on the previous line: one more
// reciprocal refinement, then a final FMA correction of the quotient.
svrecps_f64(y, t)); t = svmul_f64_x(ptrue, t, svrecps_f64(y, t));
  u = svmul_f64_x(ptrue, x, t);
  u = svmad_f64_x(ptrue, svmsb_f64_x(ptrue, y, u, x), t, u);
  return u;
#endif
}

static INLINE vdouble vrec_vd_vd(vdouble d) {
#ifndef ENABLE_ALTDIV
  return svdivr_n_f64_x(ptrue, d, 1.0);
#else
  // 1/±inf is forced to 0; everything else goes through the ALTDIV division.
  return vsel_vd_vo_vd_vd(
      svcmpeq_f64(ptrue, vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY)),
      vcast_vd_d(0), vdiv_vd_vd_vd(vcast_vd_d(1.0f), d));
#endif
}

static INLINE vdouble vsqrt_vd_vd(vdouble d) {
#ifndef ENABLE_ALTSQRT
  return svsqrt_f64_x(ptrue, d);
#else
  // Gives correctly rounded result for all input range
  // Refines an initial rsqrt estimate (svrsqrte) with Newton steps on both
  // x ~ sqrt(d) and w ~ 1/(2*sqrt(d)), then applies a final correction.
  vdouble w, x, y, z;
  y = svrsqrte_f64(d);
  x = vmul_vd_vd_vd(d, y);
  w = vmul_vd_vd_vd(vcast_vd_d(0.5), y);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x);
  w = vfma_vd_vd_vd_vd(w, y, w);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x);
  w = vfma_vd_vd_vd_vd(w, y, w);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(1.5));
  w = vadd_vd_vd_vd(w, w);
  w = vmul_vd_vd_vd(w, y);
  x = vmul_vd_vd_vd(w, d);
  y = vfmapn_vd_vd_vd_vd(w, d, x);
  z = vfmanp_vd_vd_vd_vd(w, x, vcast_vd_d(1));
  z = vfmanp_vd_vd_vd_vd(w, y, z);
  w = vmul_vd_vd_vd(vcast_vd_d(0.5), x);
  w = vfma_vd_vd_vd_vd(w, z, y);
  w = vadd_vd_vd_vd(w, x);
  // sqrt(0) and sqrt(inf) pass the input through unchanged.
  return svsel_f64(svorr_b_z(ptrue, svcmpeq_f64(ptrue, d, vcast_vd_d(0)),
                             svcmpeq_f64(ptrue, d,
                                         vcast_vd_d(SLEEF_INFINITY))),
                   d, w);
#endif
}

// Float comparison
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) {
  return svcmplt_f64(ptrue, x, y);
}
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) {
  return svcmpeq_f64(ptrue, x, y);
}
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) {
  return svcmpgt_f64(ptrue, x, y);
}
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) {
  return svcmpge_f64(ptrue, x, y);
}
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) {
  return svcmpne_f64(ptrue, x, y);
}
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) {
  return svcmple_f64(ptrue, x, y);
}

// predicates
// (declaration continues on the next line of the excerpt)
static INLINE
// Completion of the declaration begun on the previous line.
// NaN is the only value for which vd != vd.
vopmask visnan_vo_vd(vdouble vd) { return svcmpne_f64(ptrue, vd, vd); }
static INLINE vopmask visinf_vo_vd(vdouble vd) {
  return svcmpeq_n_f64(ptrue, svabs_f64_x(ptrue, vd), SLEEF_INFINITY);
}
static INLINE vopmask vispinf_vo_vd(vdouble vd) {
  return svcmpeq_n_f64(ptrue, vd, SLEEF_INFINITY);
}
static INLINE vopmask visminf_vo_vd(vdouble vd) {
  return svcmpeq_n_f64(ptrue, vd, -SLEEF_INFINITY);
}

// Comparing bit masks
// Equality of full 64-bit chunks (masks are stored as 32-bit lanes).
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
  return svcmpeq_s64(ptrue, svreinterpret_s64_s32(x),
                     svreinterpret_s64_s32(y));
}

// pure predicate operations
// SVE predicates are element-size agnostic here, so these casts are no-ops.
static INLINE vopmask vcast_vo32_vo64(vopmask o) { return o; }
static INLINE vopmask vcast_vo64_vo32(vopmask o) { return o; }

// logical integer operations
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) {
  // This needs to be a zeroing instruction because we need to make
  // sure that the inactive elements for the unpacked integers vector
  // are zero.
  return svand_s32_z(x, y, y);
}

static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) {
  return svsel_s32(x, ALL_FALSE_MASK, y);
}

#define vsra_vi_vi_i(x, c) svasr_n_s32_x(ptrue, x, c)
//@#define vsra_vi_vi_i(x, c) svasr_n_s32_x(ptrue, x, c)
#define vsll_vi_vi_i(x, c) svlsl_n_s32_x(ptrue, x, c)
//@#define vsll_vi_vi_i(x, c) svlsl_n_s32_x(ptrue, x, c)

// Logical right shift goes through u32 since svlsr is unsigned-only.
static INLINE vint vsrl_vi_vi_i(vint x, int c) {
  return svreinterpret_s32_u32(
      svlsr_n_u32_x(ptrue, svreinterpret_u32_s32(x), c));
}

static INLINE vint vand_vi_vi_vi(vint x, vint y) {
  return svand_s32_x(ptrue, x, y);
}
// AND-NOT: y & ~x (note svbic's operand order).
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) {
  return svbic_s32_x(ptrue, y, x);
}
static INLINE vint vxor_vi_vi_vi(vint x, vint y) {
  return sveor_s32_x(ptrue, x, y);
}

// integer math
static INLINE vint vadd_vi_vi_vi(vint x, vint y) {
  return svadd_s32_x(ptrue, x, y);
}
static INLINE vint vsub_vi_vi_vi(vint x, vint y) {
  return svsub_s32_x(ptrue, x, y);
}
static INLINE vint vneg_vi_vi(vint x) { return svneg_s32_x(ptrue, x); }

// integer comparison
// (declaration continues on the next line of the excerpt)
static INLINE vopmask
// Completion of the declaration begun on the previous line.
vgt_vo_vi_vi(vint x, vint y) { return svcmpgt_s32(ptrue, x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) {
  return svcmpeq_s32(ptrue, x, y);
}

// Splat
static INLINE vint vcast_vi_i(int i) { return svdup_n_s32(i); }

// bitmask logical operations
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) {
  // This needs to be a zeroing instruction because we need to make
  // sure that the inactive elements for the unpacked integers vector
  // are zero.
  return svreinterpret_s32_s64(
      svand_s64_z(x, svreinterpret_s64_s32(y), svreinterpret_s64_s32(y)));
}

static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) {
  return svreinterpret_s32_s64(svsel_s64(
      x, svreinterpret_s64_s32(ALL_FALSE_MASK), svreinterpret_s64_s32(y)));
}

static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) {
  return svreinterpret_s32_s64(svsel_s64(
      x, svreinterpret_s64_s32(ALL_TRUE_MASK), svreinterpret_s64_s32(y)));
}

// Swap the two 32-bit halves of every 64-bit chunk (svrevw on u64 view).
static INLINE vfloat vrev21_vf_vf(vfloat vf) {
  return svreinterpret_f32_u64(svrevw_u64_x(ptrue, svreinterpret_u64_f32(vf)));
}

static INLINE vint2 vrev21_vi2_vi2(vint2 i) {
  return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i)));
}

// Comparison returning integer
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
  return svsel_s32(svcmpeq_s32(ptrue, x, y), ALL_TRUE_MASK, ALL_FALSE_MASK);
}

// Gather
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
  return svld1_gather_s64index_f64(ptrue, ptr, svreinterpret_s64_s32(vi));
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
  return svld1_gather_s32index_f32(ptrue, ptr, vi2);
}

// Operations for DFT
// Merging negation over a repeating 0/1 predicate pattern: vposneg negates
// odd-indexed lanes, vnegpos negates even-indexed lanes.
static INLINE vdouble vposneg_vd_vd(vdouble d) {
  return svneg_f64_m(d, svdupq_n_b64(0, 1), d);
}
static INLINE vdouble vnegpos_vd_vd(vdouble d) {
  return svneg_f64_m(d, svdupq_n_b64(1, 0), d);
}
static INLINE vfloat vposneg_vf_vf(vfloat d) {
  return svneg_f32_m(d, svdupq_n_b32(0, 1, 0, 1), d);
}
// (call continues on the next line of the excerpt)
static INLINE vfloat vnegpos_vf_vf(vfloat d) {
  return svneg_f32_m(d, svdupq_n_b32(1, 0, 1, 0),
// Completion of vnegpos_vf_vf begun on the previous line.
d); }

// Alternating subtract/add built from the negpos sign-flip helpers.
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) {
  return vadd_vd_vd_vd(x, vnegpos_vd_vd(y));
}
static INLINE vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) {
  return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1));
}
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  return vfma_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z));
}
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
  return vfma_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z));
}

//
// Swap each pair of adjacent 64-bit lanes.
static INLINE vdouble vrev21_vd_vd(vdouble x) {
  return svzip1_f64(svuzp2_f64(x, x), svuzp1_f64(x, x));
}

// Reverse the vector pair-wise via a descending index table (svtbl).
static INLINE vdouble vreva2_vd_vd(vdouble vd) {
  svint64_t x = svindex_s64((VECTLENDP-1), -1);
  x = svzip1_s64(svuzp2_s64(x, x), svuzp1_s64(x, x));
  return svtbl_f64(vd, svreinterpret_u64_s64(x));
}

static INLINE vfloat vreva2_vf_vf(vfloat vf) {
  svint32_t x = svindex_s32((VECTLENSP-1), -1);
  x = svzip1_s32(svuzp2_s32(x, x), svuzp1_s32(x, x));
  return svtbl_f32(vf, svreinterpret_u32_s32(x));
}

//
// Scatter pairs of lanes with a stride of step*2 elements starting at
// ptr + offset*2.
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step,
                                        vdouble v) {
  svst1_scatter_u64index_f64(ptrue, ptr + offset*2,
                             svzip1_u64(svindex_u64(0, step*2),
                                        svindex_u64(1, step*2)), v);
}

static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step,
                                        vfloat v) {
  svst1_scatter_u32index_f32(ptrue, ptr + offset*2,
                             svzip1_u32(svindex_u32(0, step*2),
                                        svindex_u32(1, step*2)), v);
}

// Aligned/streaming stores alias the unaligned store on SVE.
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) {
  vstoreu_v_p_vd(ptr, v);
}
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) {
  vstore_v_p_vd(ptr, v);
}
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) {
  vstoreu_v_p_vf(ptr, v);
}
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) {
  vstore_v_p_vf(ptr, v);
}
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step,
                                         vdouble v) {
  vscatter2_v_p_i_i_vd(ptr, offset, step, v);
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step,
                                         vfloat v) {
  vscatter2_v_p_i_i_vf(ptr, offset, step, v);
}

//
These functions are for debugging
// Extract lane 0 by spilling the whole vector to a stack VLA.
static double vcast_d_vd(vdouble v) {
  double a[svcntd()];
  vstoreu_v_p_vd(a, v);
  return a[0];
}

static float vcast_f_vf(vfloat v) {
  float a[svcntw()];
  vstoreu_v_p_vf(a, v);
  return a[0];
}

static int vcast_i_vi(vint v) {
  int a[svcntw()];
  vstoreu_v_p_vi(a, v);
  return a[0];
}

static int vcast_i_vi2(vint2 v) {
  int a[svcntw()];
  vstoreu_v_p_vi2(a, v);
  return a[0];
}

//
// Pair-wise (de)shuffling of the two halves of a vmask2 via 64-bit
// transpose operations (svtrn1/svtrn2).
static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) {
  return vm2setxy_vm2_vm_vm(
      svreinterpret_s32_u64(
          svtrn1_u64(svreinterpret_u64_s32(vm2getx_vm_vm2(v)),
                     svreinterpret_u64_s32(vm2gety_vm_vm2(v)))),
      svreinterpret_s32_u64(
          svtrn2_u64(svreinterpret_u64_s32(vm2getx_vm_vm2(v)),
                     svreinterpret_u64_s32(vm2gety_vm_vm2(v)))));
}

static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
  return vm2setxy_vm2_vm_vm(
      svreinterpret_s32_u64(
          svtrn1_u64(svreinterpret_u64_s32(vm2getx_vm_vm2(v)),
                     svreinterpret_u64_s32(vm2gety_vm_vm2(v)))),
      svreinterpret_s32_u64(
          svtrn2_u64(svreinterpret_u64_s32(vm2getx_vm_vm2(v)),
                     svreinterpret_u64_s32(vm2gety_vm_vm2(v)))));
}

static INLINE vint vuninterleave_vi_vi(vint v) {
  return svreinterpret_s32_u64(svuzp1_u64(
      svtrn1_u64(svreinterpret_u64_s32(v), svreinterpret_u64_s32(v)),
      svtrn2_u64(svreinterpret_u64_s32(v), svreinterpret_u64_s32(v))));
}

static INLINE vdouble vinterleave_vd_vd(vdouble vd) {
  return svtrn1_f64(svzip1_f64(vd, vd), svzip2_f64(vd, vd));
}
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) {
  return svuzp1_f64(svtrn1_f64(vd, vd), svtrn2_f64(vd, vd));
}

static INLINE vmask vinterleave_vm_vm(vmask vm) {
  return svreinterpret_s32_u64(svtrn1_u64(
      svzip1_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm)),
      svzip2_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm))));
}

static INLINE vmask vuninterleave_vm_vm(vmask vm) {
  return svreinterpret_s32_u64(svuzp1_u64(
      svtrn1_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm)),
      svtrn2_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm))));
}

// Load a vmask2 from unaligned memory.
// (body continues on the next line of the excerpt)
static vmask2 vloadu_vm2_p(void *p) {
  vmask2 vm2;
// Completion of vloadu_vm2_p begun on the previous line.
  memcpy(&vm2, p, VECTLENDP * 16);
  return vm2;
}

#if !defined(SLEEF_GENHEADER)
typedef Sleef_quadx vargquad;

// Convert between the in-memory quad layout and the interleaved vmask2
// working layout.
static INLINE vmask2 vcast_vm2_aq(vargquad aq) {
  return vinterleave_vm2_vm2(vloadu_vm2_p(&aq));
}

static INLINE vargquad vcast_aq_vm2(vmask2 vm2) {
  vm2 = vuninterleave_vm2_vm2(vm2);
  vargquad aq;
  memcpy(&aq, &vm2, VECTLENDP * 16);
  return aq;
}
#endif // #if !defined(SLEEF_GENHEADER)

// True iff no 64-bit lane of the predicate is active.
static INLINE int vtestallzeros_i_vo64(vopmask g) {
  return svcntp_b64(svptrue_b64(), g) == 0;
}

static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
  return svreinterpret_s32_s64(
      svsel_s64(o, svreinterpret_s64_s32(x), svreinterpret_s64_s32(y)));
}

// 64-bit arithmetic on masks stored as 32-bit lanes.
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
  return svreinterpret_s32_s64(
      svsub_s64_x(ptrue, svreinterpret_s64_s32(x), svreinterpret_s64_s32(y)));
}

static INLINE vmask vneg64_vm_vm(vmask x) {
  return svreinterpret_s32_s64(svneg_s64_x(ptrue, svreinterpret_s64_s32(x)));
}

static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
  return svcmpgt_s64(ptrue, svreinterpret_s64_s32(x),
                     svreinterpret_s64_s32(y));
}

// 64-bit shifts on the u64 view of the mask.
#define vsll64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsl_n_u64_x(ptrue, svreinterpret_u64_s32(x), c))
//@#define vsll64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsl_n_u64_x(ptrue, svreinterpret_u64_s32(x), c))
#define vsrl64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsr_n_u64_x(ptrue, svreinterpret_u64_s32(x), c))
//@#define vsrl64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsr_n_u64_x(ptrue, svreinterpret_u64_s32(x), c))

// Sign-extend 32-bit ints into 64-bit lanes (svextw).
static INLINE vmask vcast_vm_vi(vint vi) {
  return svreinterpret_s32_s64(
      svextw_s64_z(ptrue, svreinterpret_s64_s32(vi)));
}
// Keep only the low 32 bits of each 64-bit lane.
static INLINE vint vcast_vi_vm(vmask vm) {
  return vand_vm_vm_vm(vm, vcast_vm_i_i(0, 0xffffffff));
}


================================================
FILE: src/memory.cpp
================================================
/*

Copyright (c) 2019 Agenium Scale

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files
(the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifdef NSIMD_IS_MSVC #include #else #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200112L #endif #include #endif // ---------------------------------------------------------------------------- #define NSIMD_INSIDE #include // ---------------------------------------------------------------------------- extern "C" { NSIMD_DLLEXPORT void *nsimd_aligned_alloc(nsimd_nat n) { #ifdef NSIMD_IS_MSVC return _aligned_malloc(n, NSIMD_MAX_ALIGNMENT); #else void *ptr; if (posix_memalign(&ptr, NSIMD_MAX_ALIGNMENT, (size_t)n)) { return NULL; } else { return ptr; } #endif } // ---------------------------------------------------------------------------- NSIMD_DLLEXPORT void nsimd_aligned_free(void *ptr) { #ifdef NSIMD_IS_MSVC _aligned_free(ptr); #else free(ptr); #endif } } // extern "C" ================================================ FILE: src/misc.h ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) // #ifndef __MISC_H__ #define __MISC_H__ #if !defined(SLEEF_GENHEADER) #include #include #endif #ifndef M_PI #define M_PI 3.141592653589793238462643383279502884 #endif #ifndef M_PIl #define M_PIl 3.141592653589793238462643383279502884L #endif #ifndef M_1_PI #define M_1_PI 0.318309886183790671537767526745028724 #endif #ifndef M_1_PIl #define M_1_PIl 0.318309886183790671537767526745028724L #endif #ifndef M_2_PI #define M_2_PI 0.636619772367581343075535053490057448 #endif #ifndef M_2_PIl #define M_2_PIl 0.636619772367581343075535053490057448L #endif #ifndef SLEEF_FP_ILOGB0 #define SLEEF_FP_ILOGB0 ((int)-2147483648) #endif #ifndef SLEEF_FP_ILOGBNAN #define SLEEF_FP_ILOGBNAN ((int)2147483647) #endif #define SLEEF_SNAN (((union { long long int i; double d; }) { .i = INT64_C(0x7ff0000000000001) }).d) #define SLEEF_SNANf (((union { long int i; float f; }) { .i = 0xff800001 }).f) // /* PI_A to PI_D are constants that satisfy the following two conditions. * For PI_A, PI_B and PI_C, the last 28 bits are zero. * PI_A + PI_B + PI_C + PI_D is close to PI as much as possible. The argument of a trig function is multiplied by 1/PI, and the integral part is divided into two parts, each has at most 28 bits. So, the maximum argument that could be correctly reduced should be 2^(28*2-1) PI = 1.1e+17. However, due to internal double precision calculation, the actual maximum argument that can be correctly reduced is around 2^47. */ #define PI_A 3.1415926218032836914 #define PI_B 3.1786509424591713469e-08 #define PI_C 1.2246467864107188502e-16 #define PI_D 1.2736634327021899816e-24 #define TRIGRANGEMAX 1e+14 /* PI_A2 and PI_B2 are constants that satisfy the following two conditions. * The last 3 bits of PI_A2 are zero. * PI_A2 + PI_B2 is close to PI as much as possible. The argument of a trig function is multiplied by 1/PI, and the integral part is multiplied by PI_A2. 
 So, the maximum argument that could
 be correctly reduced should be 2^(3-1) PI = 12.6. By testing, we confirmed
 that it correctly reduces the argument up to around 15.
 */
#define PI_A2 3.141592653589793116
#define PI_B2 1.2246467991473532072e-16
#define TRIGRANGEMAX2 15

// High/low split of 2/pi for extended-precision reduction.
#define M_2_PI_H 0.63661977236758138243
#define M_2_PI_L -3.9357353350364971764e-17

#define SQRT_DBL_MAX 1.3407807929942596355e+154

#define TRIGRANGEMAX3 1e+9

#define M_4_PI 1.273239544735162542821171882678754627704620361328125

// High/low split of ln(2) and of log10(2); R_LN2 = 1/ln(2).
#define L2U .69314718055966295651160180568695068359375
#define L2L .28235290563031577122588448175013436025525412068e-12
#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
#define L10U 0.30102999566383914498 // log 2 / log 10
#define L10L 1.4205023227266099418e-13
#define LOG10_2 3.3219280948873623478703194294893901758648313930
#define L10Uf 0.3010253906f
#define L10Lf 4.605038981e-06f

//

// Single-precision counterparts of the PI splits and range limits above.
#define PI_Af 3.140625f
#define PI_Bf 0.0009670257568359375f
#define PI_Cf 6.2771141529083251953e-07f
#define PI_Df 1.2154201256553420762e-10f
#define TRIGRANGEMAXf 39000

#define PI_A2f 3.1414794921875f
#define PI_B2f 0.00011315941810607910156f
#define PI_C2f 1.9841872589410058936e-09f
#define TRIGRANGEMAX2f 125.0f

#define TRIGRANGEMAX4f 8e+6f

#define SQRT_FLT_MAX 18446743523953729536.0

#define L2Uf 0.693145751953125f
#define L2Lf 1.428606765330187045e-06f

#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
#define M_PIf ((float)M_PI)

//

#ifndef MIN
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#endif

#ifndef MAX
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#endif

#ifndef ABS
// (macro body continues on the next line of the excerpt)
#define ABS(x) ((x) < 0 ?
// Completion of the ABS macro begun on the previous line.
-(x) : (x))
#endif

// Two-level expansion so that macro arguments are expanded before
// stringization.
#define stringify(s) stringify_(s)
#define stringify_(s) #s

#if !defined(SLEEF_GENHEADER)
typedef long double longdouble;
#endif

// Two-element value structs used by SLEEF to return (value, error) pairs.
#if !defined(Sleef_double2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_double2_DEFINED
typedef struct {
  double x, y;
} Sleef_double2;
#endif

#if !defined(Sleef_float2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_float2_DEFINED
typedef struct {
  float x, y;
} Sleef_float2;
#endif

#if !defined(Sleef_longdouble2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_longdouble2_DEFINED
typedef struct {
  long double x, y;
} Sleef_longdouble2;
#endif

// Quad precision: native __float128 when available, otherwise a
// double-double struct.
#if !defined(Sleef_quad_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_quad_DEFINED
#if defined(ENABLEFLOAT128)
typedef __float128 Sleef_quad;
#else
typedef struct {
  double x, y;
} Sleef_quad;
#endif
#endif

#if !defined(Sleef_quad1_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_quad1_DEFINED
typedef union {
  struct {
    Sleef_quad x;
  };
  Sleef_quad s[1];
} Sleef_quad1;
#endif

#if !defined(Sleef_quad2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_quad2_DEFINED
typedef union {
  struct {
    Sleef_quad x, y;
  };
  Sleef_quad s[2];
} Sleef_quad2;
#endif

#if !defined(Sleef_quad4_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_quad4_DEFINED
typedef union {
  struct {
    Sleef_quad x, y, z, w;
  };
  Sleef_quad s[4];
} Sleef_quad4;
#endif

#if !defined(Sleef_quad8_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_quad8_DEFINED
typedef union {
  Sleef_quad s[8];
} Sleef_quad8;
#endif

#if defined(__ARM_FEATURE_SVE) && !defined(Sleef_quadx_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_quadx_DEFINED
typedef union {
  Sleef_quad s[32];
} Sleef_quadx;
#endif

//

// Compiler-feature macros: GCC/Clang/ICC branch.
#if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
#define LIKELY(condition) __builtin_expect(!!(condition), 1)
#define UNLIKELY(condition) __builtin_expect(!!(condition), 0)
#define RESTRICT __restrict__

#ifndef __arm__
#define ALIGNED(x) __attribute__((aligned(x)))
// (branch continues on the next line of the excerpt)
#else
#define ALIGNED(x) #endif #if defined(SLEEF_GENHEADER) #define INLINE SLEEF_ALWAYS_INLINE #define EXPORT SLEEF_INLINE #define CONST SLEEF_CONST #define NOEXPORT #else // #if defined(SLEEF_GENHEADER) #ifndef __INTEL_COMPILER #define CONST const #else #define CONST #endif #define INLINE __attribute__((always_inline)) #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) #ifndef SLEEF_STATIC_LIBS #define EXPORT __stdcall __declspec(dllexport) #define NOEXPORT #else // #ifndef SLEEF_STATIC_LIBS #define EXPORT #define NOEXPORT #endif // #ifndef SLEEF_STATIC_LIBS #else // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) #define EXPORT __attribute__((visibility("default"))) #define NOEXPORT __attribute__ ((visibility ("hidden"))) #endif // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) #endif // #if defined(SLEEF_GENHEADER) #define SLEEF_NAN __builtin_nan("") #define SLEEF_NANf __builtin_nanf("") #define SLEEF_NANl __builtin_nanl("") #define SLEEF_INFINITY __builtin_inf() #define SLEEF_INFINITYf __builtin_inff() #define SLEEF_INFINITYl __builtin_infl() #if defined(__INTEL_COMPILER) || defined (__clang__) #define SLEEF_INFINITYq __builtin_inf() #define SLEEF_NANq __builtin_nan("") #else #define SLEEF_INFINITYq __builtin_infq() #define SLEEF_NANq (SLEEF_INFINITYq - SLEEF_INFINITYq) #endif #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER) #define INLINE __forceinline #define CONST #define RESTRICT #define ALIGNED(x) #define LIKELY(condition) (condition) #define UNLIKELY(condition) (condition) #ifndef SLEEF_STATIC_LIBS #define EXPORT __declspec(dllexport) #define NOEXPORT #else #define EXPORT #define NOEXPORT #endif #if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) && !defined(SLEEF_GENHEADER) #include #endif #define SLEEF_INFINITY (1e+300 * 1e+300) #define SLEEF_NAN (SLEEF_INFINITY - 
// Completion of the SLEEF_NAN definition begun on the previous line
// (inf - inf yields NaN).
SLEEF_INFINITY)
#define SLEEF_INFINITYf ((float)SLEEF_INFINITY)
#define SLEEF_NANf ((float)SLEEF_NAN)
#define SLEEF_INFINITYl ((long double)SLEEF_INFINITY)
#define SLEEF_NANl ((long double)SLEEF_NAN)

// MSVC does not define __SSE2__ etc.; derive them from its own
// architecture macros (_M_X64 implies SSE2, _M_IX86_FP encodes /arch).
#if (defined(_M_AMD64) || defined(_M_X64))
#ifndef __SSE2__
#define __SSE2__
#define __SSE3__
#define __SSE4_1__
#endif
#elif _M_IX86_FP == 2
#ifndef __SSE2__
#define __SSE2__
#define __SSE3__
#define __SSE4_1__
#endif
#elif _M_IX86_FP == 1
#ifndef __SSE__
#define __SSE__
#endif
#endif

#endif // #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)

// Non-Linux platforms may lack the float/long-double classification
// functions; emulate them with comparisons (NaN is the only x != x).
#if !defined(__linux__)
#define isinff(x) ((x) == SLEEF_INFINITYf || (x) == -SLEEF_INFINITYf)
#define isinfl(x) ((x) == SLEEF_INFINITYl || (x) == -SLEEF_INFINITYl)
#define isnanf(x) ((x) != (x))
#define isnanl(x) ((x) != (x))
#endif

#endif // #ifndef __MISC_H__

#ifdef ENABLE_AAVPCS
#define VECTOR_CC __attribute__((aarch64_vector_pcs))
#else
#define VECTOR_CC
#endif

/* NSIMD specific */
// nsimd overrides: force plain "inline" and redirect SLEEF's reduction
// tables to the nsimd-provided symbols.
#ifndef NSIMD_SLEEF_MISC_H
#define NSIMD_SLEEF_MISC_H

#ifdef INLINE
#undef INLINE
#endif
#define INLINE inline

#define Sleef_rempitabdp nsimd_sleef_rempitab_f64
#define Sleef_rempitabsp nsimd_sleef_rempitab_f32

#endif


================================================
FILE: src/rempitab.c
================================================
// Copyright Naoki Shibata and contributors 2010 - 2020.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #include "misc.h" #if !defined(SLEEF_GENHEADER) #define FUNCATR NOEXPORT ALIGNED(64) #else #define FUNCATR EXPORT ALIGNED(64) #endif FUNCATR const double Sleef_rempitabdp[] = { 0.15915494309189531785, 1.7916237278037667488e-17, 2.5454160968749269937e-33, 2.1132476107887107169e-49, 0.03415494309189533173, 4.0384494702232122736e-18, 1.0046721413651383112e-33, 2.1132476107887107169e-49, 0.03415494309189533173, 4.0384494702232122736e-18, 1.0046721413651383112e-33, 2.1132476107887107169e-49, 0.0029049430918953351999, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.496415728504571394e-51, 0.0029049430918953351999, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.496415728504571394e-51, 0.0029049430918953351999, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.496415728504571394e-51, 0.0029049430918953351999, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.496415728504571394e-51, 0.00095181809189533563356, 1.3532164927539732229e-19, -6.4410794381603004826e-36, 1.7634898158762436344e-52, 0.00095181809189533563356, 1.3532164927539732229e-19, -6.4410794381603004826e-36, 1.7634898158762436344e-52, 0.00046353684189533574198, 2.6901432026846872871e-20, -4.2254836195018827479e-37, 9.301187206862134399e-54, 0.00021939621689533574198, 2.6901432026846872871e-20, -4.2254836195018827479e-37, 9.301187206862134399e-54, 9.7325904395335769087e-05, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 3.6290748145335769087e-05, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.7731700203357690874e-06, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.7731700203357690874e-06, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.7731700203357690874e-06, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 
1.9584727547107690874e-06, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 2.1321799510573569745e-08, 1.5185066224124613304e-24, 2.6226236120327253511e-40, 2.6283399642369025999e-57, 6.4206383167259151492e-09, -1.3585460269359374382e-25, -1.3244127270701094468e-41, -2.4695541513869446866e-57, 6.4206383167259151492e-09, -1.3585460269359374382e-25, -1.3244127270701094468e-41, -2.4695541513869446866e-57, 2.6953480182640010867e-09, -1.3585460269359374382e-25, -1.3244127270701094468e-41, -2.4695541513869446866e-57, 8.3270286903304384868e-10, 7.0940550444663151936e-26, 9.7147467687967058732e-42, 7.9392906424978921242e-59, 8.3270286903304384868e-10, 7.0940550444663151936e-26, 9.7147467687967058732e-42, 7.9392906424978921242e-59, 3.6704158172530459087e-10, 7.0940550444663151936e-26, 9.7147467687967058732e-42, 7.9392906424978921242e-59, 1.3421093807143501366e-10, 1.9241762160098927996e-26, 3.9750282589222551507e-42, 7.9392906424978921242e-59, 1.7795616244500218596e-11, -1.452834466126541428e-28, -1.5869767474823787636e-44, -2.6168913164368963837e-61, 1.7795616244500218596e-11, -1.452834466126541428e-28, -1.5869767474823787636e-44, -2.6168913164368963837e-61, 1.7795616244500218596e-11, -1.452834466126541428e-28, -1.5869767474823787636e-44, -2.6168913164368963837e-61, 3.2437010161333667893e-12, 
-1.452834466126541428e-28, -1.5869767474823787636e-44, -2.6168913164368963837e-61, 3.2437010161333667893e-12, -1.452834466126541428e-28, -1.5869767474823787636e-44, -2.6168913164368963837e-61, 3.2437010161333667893e-12, -1.452834466126541428e-28, -1.5869767474823787636e-44, -2.6168913164368963837e-61, 1.4247116125875099096e-12, 2.5861333686050385673e-28, 2.8971783383570358633e-44, -2.6168913164368963837e-61, 5.1521691081458187359e-13, 5.6664945123924856962e-29, 6.5510079543732854985e-45, -2.6168913164368963837e-61, 6.0469559928117805118e-14, 6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 6.0469559928117805118e-14, 6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 6.0469559928117805118e-14, 6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.6261410673097965595e-15, -1.3304005198798645927e-31, -1.7578597149294783985e-47, 8.4432539107728104262e-64, 3.6261410673097965595e-15, -1.3304005198798645927e-31, -1.7578597149294783985e-47, 8.4432539107728104262e-64, 3.6261410673097965595e-15, -1.3304005198798645927e-31, -1.7578597149294783985e-47, 8.4432539107728104262e-64, 3.6261410673097965595e-15, -1.3304005198798645927e-31, -1.7578597149294783985e-47, 8.4432539107728104262e-64, 7.3427388509295482183e-17, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659578102e-65, 7.3427388509295482183e-17, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659578102e-65, 7.3427388509295482183e-17, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659578102e-65, 7.3427388509295482183e-17, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659578102e-65, 7.3427388509295482183e-17, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659578102e-65, 7.3427388509295482183e-17, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659578102e-65, 1.7916237278037667488e-17, 2.5454160968749269937e-33, 
2.1132476107887107169e-49, 8.7154294504188129325e-66, 1.7916237278037667488e-17, 2.5454160968749269937e-33, 2.1132476107887107169e-49, 8.7154294504188129325e-66, 4.0384494702232122736e-18, 1.0046721413651383112e-33, 2.1132476107887107169e-49, 8.7154294504188129325e-66, 4.0384494702232122736e-18, 1.0046721413651383112e-33, 2.1132476107887107169e-49, 8.7154294504188129325e-66, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67, 1.3532164927539732229e-19, -6.4410794381603004826e-36, 1.7634898158762432635e-52, 3.5887057810247033998e-68, 1.3532164927539732229e-19, -6.4410794381603004826e-36, 1.7634898158762432635e-52, 3.5887057810247033998e-68, 2.6901432026846872871e-20, -4.2254836195018827479e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.6901432026846872871e-20, -4.2254836195018827479e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.6901432026846872871e-20, -4.2254836195018827479e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 1.3348904870778067446e-20, -4.2254836195018827479e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 6.5726412927436632287e-21, 1.0820844071023395684e-36, 1.7634898158762432635e-52, 3.5887057810247033998e-68, 3.1845095037264626247e-21, 3.2976802257607573031e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 1.4904436092178623228e-21, -4.6390169687056261795e-38, -1.1392999419355048437e-54, -4.587677453735884283e-71, 6.4341066196356198368e-22, -4.6390169687056261795e-38, -1.1392999419355048437e-54, -4.587677453735884283e-71, 2.1989418833641172011e-22, 4.7649378378726728402e-38, 9.3011872068621332399e-54, 1.113250147552460308e-69, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 
5.3358074162805516304e-73, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 1.5185066224124613304e-24, 2.6226236120327253511e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 1.5185066224124613304e-24, 2.6226236120327253511e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 1.5185066224124613304e-24, 2.6226236120327253511e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 6.9132600985943383921e-25, 7.8591368887290111994e-41, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 2.7773570358292009361e-25, -1.3244127270701094468e-41, -2.4695541513869446866e-57, -3.2399200798614356002e-74, 7.0940550444663151936e-26, 9.7147467687967058732e-42, 7.9392906424978921242e-59, 2.9745456030524896742e-75, 7.0940550444663151936e-26, 9.7147467687967058732e-42, 7.9392906424978921242e-59, 2.9745456030524896742e-75, 1.9241762160098927996e-26, 3.9750282589222551507e-42, 7.9392906424978921242e-59, 2.9745456030524896742e-75, 1.9241762160098927996e-26, 3.9750282589222551507e-42, 7.9392906424978921242e-59, 2.9745456030524896742e-75, 6.317065088957874881e-27, -3.2976062348358281152e-43, -2.6168913164368963837e-61, 3.7036201000008290615e-78, 6.317065088957874881e-27, -3.2976062348358281152e-43, -2.6168913164368963837e-61, 3.7036201000008290615e-78, 3.0858908211726098086e-27, 3.8770419025072344914e-43, 7.9392906424978921242e-59, 2.9745456030524896742e-75, 1.4703036872799779898e-27, 2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008290615e-78, 6.625101203336619011e-28, 2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008290615e-78, 2.5861333686050385673e-28, 
2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008290615e-78, 5.6664945123924856962e-29, 6.5510079543732854985e-45, -2.6168913164368963837e-61, 3.7036201000008290615e-78, 5.6664945123924856962e-29, 6.5510079543732854985e-45, -2.6168913164368963837e-61, 3.7036201000008290615e-78, 6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78, 6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78, 6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78, 6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78, 3.0224035688960604996e-30, 2.451648649116083682e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78, 1.4446817584540368888e-30, 2.451648649116083682e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78, 6.5582085323302525856e-31, 7.0002556871006273225e-47, 1.0567786762735315635e-62, -6.1446417754639313137e-79, 2.6139040062251944343e-31, -1.7578597149294783985e-47, 8.4432539107728090768e-64, 1.9517662449371102229e-79, 6.4175174317266470186e-32, 4.3166913557804827486e-48, 8.4432539107728090768e-64, 1.9517662449371102229e-79, 6.4175174317266470186e-32, 4.3166913557804827486e-48, 8.4432539107728090768e-64, 1.9517662449371102229e-79, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659569668e-65, -7.2335760163150273591e-81, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659569668e-65, -7.2335760163150273591e-81, 2.5454160968749269937e-33, 2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.5454160968749269937e-33, 2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.5454160968749269937e-33, 2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 1.0046721413651383112e-33, 2.1132476107887107169e-49, 
8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.3430016361024414106e-34, 4.0267819632970559834e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82, 2.3430016361024414106e-34, 4.0267819632970559834e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82, 4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67, 1.4185069655957361252e-83, 4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67, 1.4185069655957361252e-83, 4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67, 1.4185069655957361252e-83, 1.7633044866680145008e-35, 2.8491136916798196016e-51, 4.0680767287898916022e-67, 1.4185069655957361252e-83, 5.595982714259923599e-36, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085140685e-84, 5.595982714259923599e-36, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085140685e-84, 2.5867171761548675786e-36, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085140685e-84, 1.0820844071023395684e-36, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085140685e-84, 3.2976802257607573031e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280944778e-86, 3.2976802257607573031e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280944778e-86, 1.4168892644450972904e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280944778e-86, 4.7649378378726728402e-38, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280944778e-86, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 
6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 2.6226236120327253511e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 7.8591368887290111994e-41, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 7.8591368887290111994e-41, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 3.2673620808294506214e-41, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 9.7147467687967058732e-42, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91, 9.7147467687967058732e-42, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91, 3.9750282589222551507e-42, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91, 1.1051690039850297894e-42, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91, 1.1051690039850297894e-42, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91, 3.8770419025072344914e-43, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91, 2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 6.5510079543732854985e-45, -2.6168913164368963837e-61, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 6.5510079543732854985e-45, -2.6168913164368963837e-61, 
3.7036201000008285821e-78, 5.6554937751584084315e-94, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 2.451648649116083682e-46, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 2.451648649116083682e-46, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 7.0002556871006273225e-47, 1.0567786762735315635e-62, -6.1446417754639301152e-79, -1.5355611056488084652e-94, 7.0002556871006273225e-47, 1.0567786762735315635e-62, -6.1446417754639301152e-79, -1.5355611056488084652e-94, 2.6211979860855749482e-47, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 4.3166913557804827486e-48, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 4.3166913557804827486e-48, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 4.3166913557804827486e-48, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 1.5797802926460750146e-48, 2.3660905534865399025e-64, -7.2335760163150273591e-81, 2.8738690232659205689e-99, 2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 4.0267819632970559834e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82, 2.8738690232659205689e-99, 4.0267819632970559834e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82, 2.8738690232659205689e-99, 4.0267819632970559834e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82, 2.8738690232659205689e-99, 
1.8885701952232994665e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82, 2.8738690232659205689e-99, 8.1946431118642097069e-51, 1.5937536410989638719e-66, 1.459625439463388979e-82, 2.8738690232659205689e-99, 2.8491136916798196016e-51, 4.0680767287898916022e-67, 1.4185069655957361252e-83, -7.8369062883735917115e-100, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142808004e-99, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142808004e-99, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142808004e-99, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142808004e-99, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102, 4.0809436324633147776e-54, -4.587677453735884283e-71, -2.8859500138942368532e-87, -5.6567402911297190423e-103, 1.470821845263904967e-54, -4.587677453735884283e-71, -2.8859500138942368532e-87, -5.6567402911297190423e-103, 1.6576095166419998917e-55, 2.6568658093254848067e-71, 5.1571087196495574384e-87, 3.2728487032630537605e-103, 1.6576095166419998917e-55, 2.6568658093254848067e-71, 5.1571087196495574384e-87, 3.2728487032630537605e-103, 1.6576095166419998917e-55, 2.6568658093254848067e-71, 5.1571087196495574384e-87, 3.2728487032630537605e-103, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105, 2.6283399642369020339e-57, 
5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107, 3.9565608646667614317e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107, 1.9651959757511960854e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107, 9.6951353129341363331e-60, 7.6368645294831185015e-76, 1.0603435429602168369e-91, 1.0451839188820145747e-108, 4.7167230906452229674e-60, 7.6368645294831185015e-76, 1.0603435429602168369e-91, 1.0451839188820145747e-108, 2.2275169795007668372e-60, 2.1097166542226745549e-76, 4.4670685979800101779e-92, 1.0451839188820145747e-108, 9.8291392392853877215e-61, -6.5385728340754726503e-77, -1.3520652573660833788e-93, -2.3220403312043059402e-109, 3.6061239614242446325e-61, 7.2792968540756372162e-77, 1.3988851821689310822e-92, 1.0451839188820145747e-108, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 
-1.9306041120023063932e-110, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0567786762735315635e-62, -6.1446417754639301152e-79, -1.535561105648808199e-94, -1.9306041120023063932e-110, 1.0567786762735315635e-62, -6.1446417754639301152e-79, -1.535561105648808199e-94, -1.9306041120023063932e-110, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 6.5314563001514358328e-112, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 6.5314563001514358328e-112, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 6.5314563001514358328e-112, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 6.5314563001514358328e-112, 2.3660905534865399025e-64, -7.2335760163150273591e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 2.3660905534865399025e-64, -7.2335760163150273591e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 8.4679971416497210292e-65, -7.2335760163150273591e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 3.9676455775389135587e-66, 1.459625439463388979e-82, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 1.5937536410989638719e-66, 1.459625439463388979e-82, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 4.0680767287898916022e-67, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894110579e-116, 
4.0680767287898916022e-67, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894110579e-116, 1.1007118082399544936e-67, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894110579e-116, 1.1007118082399544936e-67, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894110579e-116, 3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142805974e-99, 1.8395411057335783574e-115, 3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142805974e-99, 1.8395411057335783574e-115, 1.7341027056809927069e-68, 1.830931441234090934e-84, 1.3069928418846076386e-100, 3.1677600334418876704e-116, 8.0680116800913756637e-69, -2.2809159455312046184e-85, -4.0748824503880445403e-101, -6.3915272253158644628e-117, 3.4315039917320989315e-69, -2.2809159455312046184e-85, -4.0748824503880445403e-101, -6.3915272253158644628e-117, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, 5.3368668650755071652e-70, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, 2.4390495598509592076e-70, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, 9.901409072386855505e-71, -2.8859500138942368532e-87, -5.6567402911297190423e-103, -4.6672632026740766185e-119, 2.6568658093254848067e-71, 5.1571087196495574384e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119, 2.6568658093254848067e-71, 5.1571087196495574384e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119, 8.4572999356014273536e-72, 1.1355793528776598461e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119, 8.4572999356014273536e-72, 1.1355793528776598461e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119, 3.9294603961880721752e-72, 1.3019701118468578292e-88, -7.5747169634236195447e-105, -2.0152904854894729832e-121, 
1.6655406264813940833e-72, 1.3019701118468578292e-88, -7.5747169634236195447e-105, -2.0152904854894729832e-121, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598455046e-121, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598455046e-121, 2.5059077041472040156e-73, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598455046e-121, 1.0909578480805302081e-73, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598455046e-121, 3.8348292004719330442e-74, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598455046e-121, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 7.6368645294831185015e-76, 1.0603435429602168369e-91, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 7.6368645294831185015e-76, 1.0603435429602168369e-91, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 2.1097166542226745549e-76, 4.4670685979800101779e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 2.1097166542226745549e-76, 4.4670685979800101779e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 7.2792968540756372162e-77, 1.3988851821689310822e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251472933e-126, 3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251472933e-126, 3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251472933e-126, 
3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251472933e-126, 3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251472933e-126, 1.5445779612272179051e-78, 8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251472933e-126, 4.6505689184041232695e-79, 8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251472933e-126, 4.6505689184041232695e-79, 8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251472933e-126, 1.9517662449371099233e-79, 2.62202614552995759e-95, 6.5314563001514349095e-112, 9.9039323746573674262e-128, 6.0236490820360325022e-80, -3.7424672147304925625e-96, -1.784871512364483542e-112, 6.7095375687163151728e-129, 6.0236490820360325022e-80, -3.7424672147304925625e-96, -1.784871512364483542e-112, 6.7095375687163151728e-129, 2.6501457402022643213e-80, 3.7482149527770239293e-96, 6.5314563001514349095e-112, 9.9039323746573674262e-128, 9.6339406928538097998e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, 1.459625439463388979e-82, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, 1.459625439463388979e-82, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, 1.459625439463388979e-82, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894107761e-116, -2.1796760241698337334e-132, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894107761e-116, -2.1796760241698337334e-132, 
1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894107761e-116, -2.1796760241698337334e-132, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894107761e-116, -2.1796760241698337334e-132, 5.9489775128085131541e-84, 1.0450891972142805974e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, 1.830931441234090934e-84, 1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247800778e-132, 1.830931441234090934e-84, 1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247800778e-132, 8.0141992334048515034e-85, 1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247800778e-132, 2.8666416439368237283e-85, 1.6400545060233297363e-101, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 1.3200167453193350837e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 5.1571087196495574384e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136, 1.1355793528776598461e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136, 1.1355793528776598461e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136, 1.3019701118468578292e-88, -7.5747169634236195447e-105, -2.0152904854894725532e-121, -3.1562414818576682143e-137, 1.3019701118468578292e-88, -7.5747169634236195447e-105, -2.0152904854894725532e-121, -3.1562414818576682143e-137, 1.3019701118468578292e-88, -7.5747169634236195447e-105, -2.0152904854894725532e-121, 
-3.1562414818576682143e-137, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852683481e-137, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852683481e-137, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852683481e-137, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852683481e-137, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852683481e-137, 5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 6.8339049774534162772e-139, 5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 6.8339049774534162772e-139, 5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 6.8339049774534162772e-139, 1.0603435429602168369e-91, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141, 1.0603435429602168369e-91, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141, 1.0603435429602168369e-91, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141, 4.4670685979800101779e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141, 1.3988851821689310822e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141, 1.3988851821689310822e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141, 6.3183932821616130831e-93, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141, 2.4831640123977650651e-93, 1.9359195088038447797e-109, -4.8867691298577234423e-126, -2.0587960670007823264e-142, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251471293e-126, 
1.2214168761472102282e-142, 8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 2.62202614552995759e-95, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868987041e-145, 2.62202614552995759e-95, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868987041e-145, 1.1238897120284541253e-95, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868987041e-145, 3.7482149527770239293e-96, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868987041e-145, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 
-7.8150389500644475446e-132, -3.9681466199873824165e-148, 1.0450891972142805974e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247794521e-132, 8.5448727249069983612e-148, 1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247794521e-132, 8.5448727249069983612e-148, 1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247794521e-132, 8.5448727249069983612e-148, 1.6400545060233297363e-101, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 1.6400545060233297363e-101, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 1.6400545060233297363e-101, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 1.0404514546648604359e-103, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435652883e-152, 1.0404514546648604359e-103, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435652883e-152, 4.8235214251531210473e-104, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435652883e-152, 2.0330248644053793915e-104, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435652883e-152, 6.3777658403150887343e-105, 
-2.0152904854894725532e-121, -3.156241481857667737e-137, -7.0684085473731388916e-153, 6.3777658403150887343e-105, -2.0152904854894725532e-121, -3.156241481857667737e-137, -7.0684085473731388916e-153, 2.88964513938041089e-105, 5.7298933442091639924e-121, -3.156241481857667737e-137, -7.0684085473731388916e-153, 1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852681095e-137, 2.4782675885631257398e-153, 2.7355461367940366859e-106, -7.8994528064813712419e-123, -2.0037599452814940222e-138, 9.1598554579059548847e-155, 2.7355461367940366859e-106, -7.8994528064813712419e-123, -2.0037599452814940222e-138, 9.1598554579059548847e-155, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 6.8339049774534147855e-139, 9.1598554579059548847e-155, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 6.8339049774534147855e-139, 9.1598554579059548847e-155, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.9359195088038447797e-109, -4.8867691298577234423e-126, -2.0587960670007819622e-142, -2.8326669474241479263e-158, 1.9359195088038447797e-109, -4.8867691298577234423e-126, -2.0587960670007819622e-142, -2.8326669474241479263e-158, 1.9359195088038447797e-109, -4.8867691298577234423e-126, -2.0587960670007819622e-142, -2.8326669474241479263e-158, 8.7142954880180709975e-110, -4.8867691298577234423e-126, -2.0587960670007819622e-142, 
-2.8326669474241479263e-158, 3.3918456880078814158e-110, 6.931443500908017045e-126, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 7.3062078800278780675e-111, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.0910098773220312367e-159, 7.3062078800278780675e-111, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.0910098773220312367e-159, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868972816e-145, -1.5987060076657616072e-160, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868972816e-145, -1.5987060076657616072e-160, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868972816e-145, -1.5987060076657616072e-160, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868972816e-145, -1.5987060076657616072e-160, 2.3732923938934761454e-112, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 2.9421044076449630171e-113, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 2.9421044076449630171e-113, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 2.9421044076449630171e-113, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 3.4325196623373878948e-114, 9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161, 3.4325196623373878948e-114, 9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161, 3.4325196623373878948e-114, 9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.9106774506606945839e-164, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.9106774506606945839e-164, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.9106774506606945839e-164, 1.8395411057335783574e-115, 
-7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.9106774506606945839e-164, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.9106774506606945839e-164, 8.2436437080731844263e-116, 1.4726412753514008951e-131, -3.9681466199873824165e-148, 2.9106774506606945839e-164, 3.1677600334418871069e-116, 3.4556869017247794521e-132, 8.544872724906996972e-148, 1.6802919634942429241e-163, 6.2981819612623816536e-117, 6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254779927e-164, 6.2981819612623816536e-117, 6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254779927e-164, 6.2981819612623816536e-117, 6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254779927e-164, 3.1257546646178208289e-117, -6.6414926959353515111e-134, -5.7828074707888119584e-150, -1.2825052715093464343e-165, 1.5395410162955400644e-117, -6.6414926959353515111e-134, -5.7828074707888119584e-150, -1.2825052715093464343e-165, 7.4643419213439950602e-118, 1.0969016447485317626e-133, -5.7828074707888119584e-150, -1.2825052715093464343e-165, 3.4988078005382940294e-118, 2.1637618757749825688e-134, -8.9490928918944555247e-151, -1.9717385086233606481e-166, 1.5160407401354430737e-118, 2.1637618757749825688e-134, -8.9490928918944555247e-151, -1.9717385086233606481e-166, 5.2465720993401781599e-119, -3.755176715260116501e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 
1.3475077173907800538e-120, -3.156241481857667737e-137, -7.0684085473731388916e-153, -3.3573283875161501977e-170, 5.7298933442091639924e-121, -3.156241481857667737e-137, -7.0684085473731388916e-153, -3.3573283875161501977e-170, 1.8573014293598452896e-121, 1.1431992269852681095e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 1.8573014293598452896e-121, 1.1431992269852681095e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 8.8915345064751572143e-122, 1.1431992269852681095e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 4.0507946129135104481e-122, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911825673e-172, 1.6304246661326865276e-122, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911825673e-172, 4.2023969274227456735e-123, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911825673e-172, 4.2023969274227456735e-123, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911825673e-172, 1.1769344939467164447e-123, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064369683e-172, 1.1769344939467164447e-123, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064369683e-172, 4.2056888557770896953e-124, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064369683e-172, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, 1.8749656131673758844e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, 6.931443500908017045e-126, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 
1.2381024895275844856e-174, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.0910098773220302259e-159, 1.2381024895275844856e-174, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.0910098773220302259e-159, 1.2381024895275844856e-174, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.0910098773220302259e-159, 1.2381024895275844856e-174, 2.8369889610228834887e-127, 4.0136364036021218058e-143, -1.0134099605688458828e-159, -2.5389576707476506925e-176, 2.8369889610228834887e-127, 4.0136364036021218058e-143, -1.0134099605688458828e-159, -2.5389576707476506925e-176, 9.9039323746573674262e-128, -8.6629775332868972816e-145, -1.5987060076657612913e-160, -2.5389576707476506925e-176, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 6.197724948400014906e-177, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 6.197724948400014906e-177, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 6.197724948400014906e-177, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 6.197724948400014906e-177, 9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161, -1.6991004655691155518e-177, 9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161, -1.6991004655691155518e-177, 9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161, -1.6991004655691155518e-177, 2.175994780857201024e-130, 1.4618808551874518553e-146, 1.6802919634942426156e-163, 2.8330093736631818036e-179, 2.175994780857201024e-130, 1.4618808551874518553e-146, 1.6802919634942426156e-163, 2.8330093736631818036e-179, 3.7267864457092460442e-131, 4.6083930759590139305e-147, 1.6802919634942426156e-163, 2.8330093736631818036e-179, 3.7267864457092460442e-131, 4.6083930759590139305e-147, 1.6802919634942426156e-163, 2.8330093736631818036e-179, 3.7267864457092460442e-131, 4.6083930759590139305e-147, 
1.6802919634942426156e-163, 2.8330093736631818036e-179, 1.4726412753514008951e-131, -3.9681466199873824165e-148, 2.9106774506606941983e-164, 5.1948630316441296498e-180, 3.4556869017247794521e-132, 8.544872724906996972e-148, 1.6802919634942426156e-163, 2.8330093736631818036e-179, 3.4556869017247794521e-132, 8.544872724906996972e-148, 1.6802919634942426156e-163, 2.8330093736631818036e-179, 6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254777999e-164, 1.3389912474795152755e-180, 6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254777999e-164, 1.3389912474795152755e-180, 6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254777999e-164, 1.3389912474795152755e-180, 2.8579525590905986764e-133, -5.7828074707888119584e-150, -1.2825052715093464343e-165, -1.0696067158221530218e-181, 1.0969016447485317626e-133, -5.7828074707888119584e-150, -1.2825052715093464343e-165, -1.0696067158221530218e-181, 2.1637618757749825688e-134, -8.9490928918944555247e-151, -1.9717385086233606481e-166, 1.3535321672928907047e-182, 2.1637618757749825688e-134, -8.9490928918944555247e-151, -1.9717385086233606481e-166, 1.3535321672928907047e-182, 2.1637618757749825688e-134, -8.9490928918944555247e-151, -1.9717385086233606481e-166, 1.3535321672928907047e-182, 1.0631050543111905033e-134, 1.5490398016102376505e-150, 3.4549185946116918017e-166, 1.3535321672928907047e-182, 5.1277664357929471499e-135, 3.2706525621039604902e-151, 7.4159004299416557678e-167, 1.3535321672928907047e-182, 2.3761243821334675971e-135, 3.2706525621039604902e-151, 7.4159004299416557678e-167, 1.3535321672928907047e-182, 1.0003033553037281263e-135, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, 1.4041521353514076604e-136, 
2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, 5.4426399358282049106e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 3.0568054078295488291e-186, 1.1431992269852681095e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 3.0568054078295488291e-186, 1.1431992269852681095e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 3.0568054078295488291e-186, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328578981e-188, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328578981e-188, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328578981e-188, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328578981e-188, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, -8.4789520282639751913e-191, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, -8.4789520282639751913e-191, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, -8.4789520282639751913e-191, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, 
-8.4789520282639751913e-191, 4.5016298192952031469e-142, -2.8326669474241479263e-158, 1.2381024895275844856e-174, -8.4789520282639751913e-191, 1.2214168761472102282e-142, 8.0910098773220302259e-159, 1.2381024895275844856e-174, -8.4789520282639751913e-191, 1.2214168761472102282e-142, 8.0910098773220302259e-159, 1.2381024895275844856e-174, -8.4789520282639751913e-191, 4.0136364036021218058e-143, -1.0134099605688458828e-159, -2.5389576707476506925e-176, -6.2404128071707654958e-193, 4.0136364036021218058e-143, -1.0134099605688458828e-159, -2.5389576707476506925e-176, -6.2404128071707654958e-193, 1.9635033141346264592e-143, -1.0134099605688458828e-159, -2.5389576707476506925e-176, -6.2404128071707654958e-193, 9.3843676940087855824e-144, 1.2626949989038732076e-159, 2.2730883653953564668e-175, 2.7431118386590483722e-191, 4.2590349703400483539e-144, 1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896458822e-192, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896458822e-192, 4.1503542758849472122e-145, -1.7614040799531193879e-161, -1.6991004655691153326e-177, -1.856794109153959173e-193, 4.1503542758849472122e-145, -1.7614040799531193879e-161, -1.6991004655691153326e-177, -1.856794109153959173e-193, 9.4702132359198537748e-146, 1.7950099192230045857e-161, -1.6991004655691153326e-177, -1.856794109153959173e-193, 9.4702132359198537748e-146, 1.7950099192230045857e-161, -1.6991004655691153326e-177, -1.856794109153959173e-193, 1.4618808551874518553e-146, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, 1.4618808551874518553e-146, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, 1.4618808551874518553e-146, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, 4.6083930759590139305e-147, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, 4.6083930759590139305e-147, 
1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, 2.105789206980137775e-147, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, 8.544872724906996972e-148, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, 2.2883630524598079723e-148, 2.9106774506606941983e-164, 5.1948630316441287936e-180, 9.6685396110091032843e-196, 2.2883630524598079723e-148, 2.9106774506606941983e-164, 5.1948630316441287936e-180, 9.6685396110091032843e-196, 7.2423563434801054878e-149, 1.1741471776254777999e-164, 1.3389912474795150614e-180, 1.1067843414450286726e-196, 7.2423563434801054878e-149, 1.1741471776254777999e-164, 1.3389912474795150614e-180, 1.1067843414450286726e-196, 3.3320377982006123631e-149, 3.0588204110786950436e-165, 3.7502330143836152136e-181, 3.6564932749519464998e-198, 1.3768785255608653665e-149, 3.0588204110786950436e-165, 3.7502330143836152136e-181, 3.6564932749519464998e-198, 3.9929888924099219388e-150, -1.9717385086233606481e-166, 1.3535321672928907047e-182, 3.1205762277848031878e-199, 3.9929888924099219388e-150, -1.9717385086233606481e-166, 1.3535321672928907047e-182, 3.1205762277848031878e-199, 1.5490398016102376505e-150, 3.4549185946116918017e-166, 1.3535321672928907047e-182, 3.1205762277848031878e-199, 3.2706525621039604902e-151, 7.4159004299416557678e-167, 1.3535321672928907047e-182, 3.1205762277848031878e-199, 3.2706525621039604902e-151, 7.4159004299416557678e-167, 1.3535321672928907047e-182, 3.1205762277848031878e-199, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 
2.4782675885631257398e-153, -3.3573283875161501977e-170, 3.0568054078295488291e-186, 1.4980560800565462618e-202, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 3.0568054078295488291e-186, 1.4980560800565462618e-202, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 3.0568054078295488291e-186, 1.4980560800565462618e-202, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 1.7015147267057481414e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 1.7015147267057481414e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 1.7015147267057481414e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 7.6922213530572229852e-156, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188, -3.2905064432040069127e-204, 7.0002691755702864582e-157, 6.5928896280762691321e-173, 1.1586156901317304854e-188, -1.0100405885278530137e-205, 7.0002691755702864582e-157, 6.5928896280762691321e-173, 1.1586156901317304854e-188, -1.0100405885278530137e-205, 1.1734404793201255869e-157, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207, 1.1734404793201255869e-157, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207, 1.1734404793201255869e-157, 1.2381024895275844856e-174, 
-8.4789520282639751913e-191, -1.3321093418096261919e-207, 4.4508689228885539715e-158, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207, 8.0910098773220302259e-159, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207, 8.0910098773220302259e-159, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207, 8.0910098773220302259e-159, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207, 3.5387999583765925506e-159, 2.2730883653953564668e-175, 2.7431118386590483722e-191, -1.3321093418096261919e-207, 1.2626949989038732076e-159, 2.2730883653953564668e-175, 2.7431118386590483722e-191, -1.3321093418096261919e-207, 1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, 1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, 1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, 1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, 5.3514239183991277695e-161, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, 1.7950099192230045857e-161, -1.6991004655691153326e-177, -1.8567941091539589297e-193, -1.8074851186411640793e-209, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 
1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 2.9106774506606941983e-164, 5.1948630316441287936e-180, 9.6685396110091013832e-196, 1.7562785002189357559e-211, 2.9106774506606941983e-164, 5.1948630316441287936e-180, 9.6685396110091013832e-196, 1.7562785002189357559e-211, 2.9106774506606941983e-164, 5.1948630316441287936e-180, 9.6685396110091013832e-196, 1.7562785002189357559e-211, 1.1741471776254777999e-164, 1.3389912474795150614e-180, 1.106784341445028435e-196, 3.3045982549756583552e-212, 3.0588204110786950436e-165, 3.7502330143836152136e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214, 3.0588204110786950436e-165, 3.7502330143836152136e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214, 8.8815756978467430465e-166, 1.3403131492807310959e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214, 8.8815756978467430465e-166, 1.3403131492807310959e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214, 3.4549185946116918017e-166, 1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, 7.4159004299416557678e-167, 1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, 7.4159004299416557678e-167, 1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, 2.0862146470760309789e-168, -1.146150630053972131e-184, -1.4832196127821708615e-201, 
2.6911956484118910092e-218, 2.0862146470760309789e-168, -1.146150630053972131e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, 1.026320681600434562e-168, 1.2072867382105631402e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, 4.9637369886263658882e-169, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, 2.3140020749373754342e-169, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, 9.8913461809288020723e-170, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, 3.2670088967063259373e-170, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, 3.2670088967063259373e-170, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, 1.6109245756507072713e-170, -6.2044048008378732802e-187, -5.4322544592823556944e-203, 4.2491789852161138683e-219, 7.8288241512289757055e-171, 1.2181824638728806485e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, 3.6886133485899290404e-171, 2.9887099189454666024e-187, 4.774153170641553462e-203, 4.2491789852161138683e-219, 1.6185079472704052482e-171, 2.9887099189454666024e-187, 4.774153170641553462e-203, 4.2491789852161138683e-219, 5.8345524661064358191e-172, 6.9043123899963188689e-188, -3.2905064432040069127e-204, -9.1795828160190082842e-224, 6.5928896280762691321e-173, 1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190082842e-224, 6.5928896280762691321e-173, 1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190082842e-224, 6.5928896280762691321e-173, 1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190082842e-224, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 1.2381024895275844856e-174, 
-8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 2.2730883653953564668e-175, 2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 2.2730883653953564668e-175, 2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 2.2730883653953564668e-175, 2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 1.0095962991602958391e-175, -6.2404128071707654958e-193, 3.0593092910744445285e-209, 5.4622616159087170031e-225, 3.7785026604276538491e-176, -6.2404128071707654958e-193, 3.0593092910744445285e-209, 5.4622616159087170031e-225, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, -5.3441928036578162463e-225, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, -5.3441928036578162463e-225, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, -5.3441928036578162463e-225, 2.2493122414154495675e-177, 2.5268245888628466632e-193, 3.0593092910744445285e-209, 5.4622616159087170031e-225, 2.7510588792316711745e-178, 3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450500218e-227, 2.7510588792316711745e-178, 3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450500218e-227, 2.7510588792316711745e-178, 3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450500218e-227, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 9.9192633285681635836e-229, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 
9.9192633285681635836e-229, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 9.9192633285681635836e-229, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 9.9192633285681635836e-229, 1.2906606599973359683e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 9.9192633285681635836e-229, 5.1948630316441287936e-180, 9.6685396110091013832e-196, 1.7562785002189355449e-211, 1.6821693549018732055e-227, 1.3389912474795150614e-180, 1.106784341445028435e-196, 3.3045982549756578275e-212, 6.2685154049107876715e-228, 1.3389912474795150614e-180, 1.106784341445028435e-196, 3.3045982549756578275e-212, 6.2685154049107876715e-228, 3.7502330143836152136e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214, 2.5658818466966882188e-231, 3.7502330143836152136e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214, 2.5658818466966882188e-231, 1.3403131492807310959e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214, 2.5658818466966882188e-231, 1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233, 1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233, 1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233, 1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233, 6.0043220944823941786e-183, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233, 2.2388223052591377446e-183, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 3.5607241064750984115e-184, 
-1.4832196127821708615e-201, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 1.2072867382105631402e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 1.2181824638728806485e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 2.9887099189454666024e-187, 4.774153170641553462e-203, 4.2491789852161132393e-219, 7.4467067939231424594e-235, 2.9887099189454666024e-187, 4.774153170641553462e-203, 4.2491789852161132393e-219, 7.4467067939231424594e-235, 6.9043123899963188689e-188, -3.2905064432040069127e-204, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 6.9043123899963188689e-188, -3.2905064432040069127e-204, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 4.4040360264865697732e-189, -1.0100405885278530137e-205, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 8.129755890712020335e-190, 9.8339840169166049336e-206, -9.1795828160190063645e-224, 
-2.3569545504732004486e-239, 8.129755890712020335e-190, 9.8339840169166049336e-206, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 8.129755890712020335e-190, 9.8339840169166049336e-206, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 3.6409303439428119063e-190, -1.332109341809626019e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 1.3965175705582071936e-190, -1.332109341809626019e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 1.3403538552936701153e-191, 1.7826390804083638359e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 6.389748636109812983e-192, 2.2526486929936882202e-208, -5.3441928036578156465e-225, -7.741539335184153052e-241, 2.8828536776963681193e-192, 2.2526486929936882202e-208, -5.3441928036578156465e-225, -7.741539335184153052e-241, 1.1294061984896456875e-192, 2.2526486929936882202e-208, -5.3441928036578156465e-225, -7.741539335184153052e-241, 2.5268245888628466632e-193, 3.0593092910744445285e-209, 5.4622616159087170031e-225, 4.2560351759808952526e-241, 2.5268245888628466632e-193, 3.0593092910744445285e-209, 5.4622616159087170031e-225, 4.2560351759808952526e-241, 3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450490845e-227, 1.3186893776791012681e-242, 3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450490845e-227, 1.3186893776791012681e-242, 3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450490845e-227, 1.3186893776791012681e-242, 6.1039071228393547627e-195, 1.7562785002189355449e-211, 1.6821693549018732055e-227, -8.7276385348052817035e-244, 6.1039071228393547627e-195, 
1.7562785002189355449e-211, 1.6821693549018732055e-227, -8.7276385348052817035e-244, 6.1039071228393547627e-195, 1.7562785002189355449e-211, 1.6821693549018732055e-227, -8.7276385348052817035e-244, 2.6792050150137250131e-195, 1.7562785002189355449e-211, 1.6821693549018732055e-227, -8.7276385348052817035e-244, 9.6685396110091013832e-196, 1.7562785002189355449e-211, 1.6821693549018732055e-227, -8.7276385348052817035e-244, 2.0416567491425607157e-177, 6.0959078275963141821e-193, 1.156336993964950812e-208, 2.7126166236326293347e-224, 2.0416567491425607157e-177, 6.0959078275963141821e-193, 1.156336993964950812e-208, 2.7126166236326293347e-224, 2.0416567491425607157e-177, 6.0959078275963141821e-193, 1.156336993964950812e-208, 2.7126166236326293347e-224, 6.7450395650278649168e-179, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, 6.7450395650278649168e-179, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, 6.7450395650278649168e-179, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, 6.7450395650278649168e-179, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, 6.7450395650278649168e-179, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, 5.756447103644822603e-180, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 5.756447103644822603e-180, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 5.756447103644822603e-180, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 5.756447103644822603e-180, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 1.9005753194802080146e-180, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 1.9005753194802080146e-180, -6.1924333305615830735e-198, -1.9512340798794268979e-214, 
-3.6162764918921697356e-230, 9.3660737343905436753e-181, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 4.5462340041847754398e-181, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 2.1363141390818913221e-181, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 9.3135420653044926323e-182, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 3.2887424025472810002e-182, 7.185309278132283136e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 2.7634257116867652192e-183, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749095611e-233, 2.7634257116867652192e-183, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749095611e-233, 2.7634257116867652192e-183, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749095611e-233, 2.7634257116867652192e-183, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749095611e-233, 8.806758170751374203e-184, 7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749095611e-233, 8.806758170751374203e-184, 7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749095611e-233, 4.0998834342223036605e-184, 7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749095611e-233, 1.7464460659577689118e-184, 2.612671019845610006e-200, 2.1334073625072069974e-216, -9.2331809177749095611e-233, 5.697273818255015375e-185, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 5.697273818255015375e-185, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 2.755477107924346286e-185, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 1.2845787527590117414e-185, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 5.4912957517634446918e-186, 
-1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 1.8140498638501083305e-186, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 1.8140498638501083305e-186, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 8.9473839187177424013e-187, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 4.3508265588260719497e-187, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 2.0525478788802367239e-187, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 9.0340853890731911095e-188, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 3.288388689208603045e-188, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 4.1554033927630885323e-189, -9.8582956929636044137e-206, -1.4280619485269765742e-221, 1.2171222696290252021e-237, 4.1554033927630885323e-189, -9.8582956929636044137e-206, -1.4280619485269765742e-221, 1.2171222696290252021e-237, 4.1554033927630885323e-189, -9.8582956929636044137e-206, -1.4280619485269765742e-221, 1.2171222696290252021e-237, 5.643429553477207926e-190, 1.0076094209231528444e-205, 7.8509991660024955813e-222, 1.2171222696290252021e-237, 5.643429553477207926e-190, 1.0076094209231528444e-205, 7.8509991660024955813e-222, 1.2171222696290252021e-237, 5.643429553477207926e-190, 1.0076094209231528444e-205, 7.8509991660024955813e-222, 1.2171222696290252021e-237, 1.1546040067079994973e-190, 1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 1.1546040067079994973e-190, 1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436627876e-240, 3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, 
-1.8313007053436627876e-240, 3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436627876e-240, 3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436627876e-240, 3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436627876e-240, 3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436627876e-240, 1.4863145223629928288e-192, -7.9038076992129241506e-209, -1.609965144193984205e-224, -1.8313007053436627876e-240, 6.0959078275963141821e-193, 1.156336993964950812e-208, 2.7126166236326293347e-224, -1.8313007053436627876e-240, 1.712289129579509076e-193, 1.8297811202182925249e-209, 1.1003018740995688645e-226, 5.827891678485165325e-243, 1.712289129579509076e-193, 1.8297811202182925249e-209, 1.1003018740995688645e-226, 5.827891678485165325e-243, 6.1638445507530779946e-194, -6.0361608463951204924e-210, 1.1003018740995688645e-226, 5.827891678485165325e-243, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245, 3.418509674495068119e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245, 1.7061586205822532442e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245, 8.499830936258458068e-196, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245, 4.218953301476420881e-196, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245, 2.0785144840854027628e-196, -1.9512340798794268979e-214, 
-3.6162764918921692779e-230, -2.8387319855193022476e-246, 1.008295075389893466e-196, -1.9512340798794268979e-214, -3.6162764918921692779e-230, -2.8387319855193022476e-246, 4.7318537104213881764e-197, -1.9512340798794268979e-214, -3.6162764918921692779e-230, -2.8387319855193022476e-246, 2.0563051886826149345e-197, -1.9512340798794268979e-214, -3.6162764918921692779e-230, -2.8387319855193022476e-246, 7.185309278132283136e-198, -1.9512340798794268979e-214, -3.6162764918921692779e-230, -2.8387319855193022476e-246, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 2.612671019845610006e-200, 2.1334073625072069974e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 2.612671019845610006e-200, 2.1334073625072069974e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 1.306250843215349634e-200, 2.1334073625072069974e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 6.5304075490021959302e-201, 6.8298960257742791824e-217, 6.8696910062179237095e-233, 3.8349029251851101018e-249, 3.2643571074265457254e-201, -4.2219277387461470355e-218, -1.753154605289404553e-234, -7.5861268822635538093e-251, 1.6313318866387202604e-201, -4.2219277387461470355e-218, -1.753154605289404553e-234, -7.5861268822635538093e-251, 
8.1481927624480752786e-202, -4.2219277387461470355e-218, -1.753154605289404553e-234, -7.5861268822635538093e-251, 4.0656297104785107096e-202, 4.8431832608149701961e-218, 8.3111403472061145651e-234, 1.6001805286092554504e-249, 2.0243481844937293316e-202, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, 1.0037074215013384159e-202, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, 4.9338704000514295811e-203, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, 2.3822684925704522921e-203, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, 1.1064675388299639308e-203, 2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608782288e-251, 4.6856706195971960852e-204, 2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608782288e-251, 1.4961682352459748279e-204, -8.0675475439086544798e-221, -3.6970842501441777651e-237, -5.7032870362481275794e-253, 1.4961682352459748279e-204, -8.0675475439086544798e-221, -3.6970842501441777651e-237, -5.7032870362481275794e-253, 6.9879263915816924805e-205, 9.6377473771091526132e-221, 1.5959741828948633012e-236, 2.7031904319843495713e-252, 3.0010484111426663515e-205, 7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254, 1.0076094209231528444e-205, 7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254, 1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, 1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, 1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, 1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, 1.0889925813396166947e-207, 2.4325525462765697993e-223, 
-1.1429360314275701698e-239, 8.3218722366085688343e-256, 1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, 1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436625212e-240, -2.3341145329525059632e-256, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436625212e-240, -2.3341145329525059632e-256, 1.156336993964950812e-208, 2.7126166236326293347e-224, -1.8313007053436625212e-240, -2.3341145329525059632e-256, 1.8297811202182925249e-209, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.1174271110208206547e-259, 1.8297811202182925249e-209, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.1174271110208206547e-259, 1.8297811202182925249e-209, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.1174271110208206547e-259, 6.1308251778939023781e-210, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.1174271110208206547e-259, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 
2.3568521170701555846e-212, -7.7818310317651142243e-229, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 1.1686698881356804311e-212, 1.8601114328504743806e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 5.7457877366844311816e-213, 5.409641648369814791e-229, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 2.7753321643482446169e-213, -1.1860946916976500828e-229, 6.3146909508553973881e-246, 1.2573885592501532045e-261, 1.290104378180150675e-213, 2.1117734783360818049e-229, 4.2928382696354204061e-245, -2.8075477999879273582e-261, 5.4749048509610403382e-214, 4.6283939331921604413e-230, 6.3146909508553973881e-246, 1.2573885592501532045e-261, 1.7618353855408067201e-214, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 1.7618353855408067201e-214, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 8.3356801918574821257e-215, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 3.6943433600821895879e-215, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 1.3736749441945438342e-215, -9.2331809177749077733e-233, -1.4042876247421726117e-248, -9.9505977179164858712e-265, 2.1334073625072069974e-216, -9.2331809177749077733e-233, -1.4042876247421726117e-248, -9.9505977179164858712e-265, 2.1334073625072069974e-216, -9.2331809177749077733e-233, -1.4042876247421726117e-248, -9.9505977179164858712e-265, 2.1334073625072069974e-216, -9.2331809177749077733e-233, -1.4042876247421726117e-248, -9.9505977179164858712e-265, 6.8298960257742791824e-217, 6.8696910062179237095e-233, 3.8349029251851101018e-249, -2.6436684620390282645e-267, 6.8298960257742791824e-217, 6.8696910062179237095e-233, 3.8349029251851101018e-249, -2.6436684620390282645e-267, 3.2038516259498326923e-217, -1.1817449557784924788e-233, -6.3454186796659920093e-250, -2.6436684620390282645e-267, 1.3908294260376086421e-217, 2.8439730252197153919e-233, 
3.8349029251851101018e-249, -2.6436684620390282645e-267, 4.8431832608149701961e-218, 8.3111403472061145651e-234, 1.6001805286092554504e-249, -2.6436684620390282645e-267, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267, 2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608780349e-251, 1.2329569415922591084e-267, 2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608780349e-251, 1.2329569415922591084e-267, 2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608780349e-251, 1.2329569415922591084e-267, 2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608780349e-251, 1.2329569415922591084e-267, 9.6377473771091526132e-221, 1.5959741828948633012e-236, 2.7031904319843490867e-252, 2.638005906844372114e-268, 7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270, 7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270, 7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270, 7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270, 2.318094503184431479e-222, -1.1429360314275701698e-239, 8.3218722366085688343e-256, -2.0046830753539155726e-272, 2.318094503184431479e-222, -1.1429360314275701698e-239, 8.3218722366085688343e-256, -2.0046830753539155726e-272, 9.3486833747991514629e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, -2.0046830753539155726e-272, 
2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, -2.0046830753539155726e-272, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, -2.0046830753539155726e-272, 7.0351983914592419146e-224, 7.766758903588374524e-240, 8.3218722366085688343e-256, -2.0046830753539155726e-272, 7.0351983914592419146e-224, 7.766758903588374524e-240, 8.3218722366085688343e-256, -2.0046830753539155726e-272, 2.7126166236326293347e-224, -1.8313007053436625212e-240, -2.3341145329525056675e-256, -2.0046830753539155726e-272, 5.5132573971932232487e-225, 5.6821419688934674008e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273, 5.5132573971932232487e-225, 5.6821419688934674008e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 2.560476225709334075e-227, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 2.560476225709334075e-227, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 
-1.472095602234059958e-277, 1.8601114328504743806e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277, 5.409641648369814791e-229, -3.0299000794643401155e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277, 5.409641648369814791e-229, -3.0299000794643401155e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277, 2.1117734783360818049e-229, 4.2928382696354204061e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277, 4.6283939331921604413e-230, 6.3146909508553973881e-246, 1.2573885592501529789e-261, 3.0408903374280139822e-277, 4.6283939331921604413e-230, 6.3146909508553973881e-246, 1.2573885592501529789e-261, 3.0408903374280139822e-277, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 8.7279092175580820317e-280, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 8.7279092175580820317e-280, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 8.7279092175580820317e-280, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 8.7279092175580820317e-280, 2.4841276986611042098e-231, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, 1.1958979447416775482e-231, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, 5.5178306778196421733e-232, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, 2.2972562930210755192e-232, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, 6.8696910062179237095e-233, 3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 6.8696910062179237095e-233, 3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 2.8439730252197153919e-233, 3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 8.3111403472061145651e-234, 
1.6001805286092554504e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 8.3111403472061145651e-234, 1.6001805286092554504e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 3.2789928709583552854e-234, 4.8281933032132812475e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 1.3390069830350552605e-235, -6.026193929640082176e-252, -7.0535576022338457803e-268, -4.3807022524130141006e-284, 1.3390069830350552605e-235, -6.026193929640082176e-252, -7.0535576022338457803e-268, -4.3807022524130141006e-284, 1.3390069830350552605e-235, -6.026193929640082176e-252, -7.0535576022338457803e-268, -4.3807022524130141006e-284, 5.5273393987134252385e-236, 1.1432574793608780349e-251, 1.2329569415922591084e-267, -4.3807022524130141006e-284, 1.5959741828948633012e-236, 2.7031904319843490867e-252, 2.638005906844371576e-268, 6.3790946999826013345e-284, 1.5959741828948633012e-236, 2.7031904319843490867e-252, 2.638005906844371576e-268, 6.3790946999826013345e-284, 6.1313287894022281692e-237, 5.2084434157824127104e-253, 2.1511502957481757317e-269, 3.2670891426006739096e-285, 1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270, -9.5347405022956042207e-287, 1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270, -9.5347405022956042207e-287, 1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270, -9.5347405022956042207e-287, 6.0284645465737476297e-238, -2.4742181023285720738e-254, -1.2030990169203137715e-270, -9.5347405022956042207e-287, 2.9570854717154947523e-238, 4.3456134301905148502e-254, 6.3684349745470443788e-270, -9.5347405022956042207e-287, 1.4213959342863689955e-238, 9.3569766393097138822e-255, 2.5826679788133653036e-270, 
-9.5347405022956042207e-287, 6.5355116557180594664e-239, 9.3569766393097138822e-255, 2.5826679788133653036e-270, -9.5347405022956042207e-287, 2.6962878121452450746e-239, 8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 7.766758903588374524e-240, 8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 7.766758903588374524e-240, 8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 2.9677290991223565342e-240, -2.3341145329525056675e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 5.6821419688934674008e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 5.6821419688934674008e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 5.6821419688934674008e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 2.6827483411022054912e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 1.1830515272065748694e-241, -3.117427111020820077e-259, -5.9718623963762788119e-275, 6.1155422068568954053e-291, 4.3320312025875939195e-242, -3.117427111020820077e-259, -5.9718623963762788119e-275, 6.1155422068568954053e-291, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 6.1155422068568954053e-291, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 6.1155422068568954053e-291, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 6.1155422068568954053e-291, 1.1413391350613183311e-243, -5.1586784110844895013e-260, -1.9524039360882352712e-276, -2.9779654517181717279e-292, 1.1413391350613183311e-243, -5.1586784110844895013e-260, -1.9524039360882352712e-276, -2.9779654517181717279e-292, 1.1413391350613183311e-243, -5.1586784110844895013e-260, -1.9524039360882352712e-276, -2.9779654517181717279e-292, 5.5552006713333735927e-244, 
7.8491179384773690214e-260, -1.9524039360882352712e-276, -2.9779654517181717279e-292, 2.6261053316934700345e-244, 1.345219763696439399e-260, 1.6579848156414234801e-276, 1.0303712682997740506e-292, 1.1615576618735179302e-244, 1.345219763696439399e-260, 1.6579848156414234801e-276, 1.0303712682997740506e-292, 4.2928382696354204061e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277, 2.8287088295287585094e-294, 6.3146909508553973881e-246, 1.2573885592501529789e-261, 3.0408903374280139822e-277, 2.8287088295287585094e-294, 6.3146909508553973881e-246, 1.2573885592501529789e-261, 3.0408903374280139822e-277, 2.8287088295287585094e-294, 6.3146909508553973881e-246, 1.2573885592501529789e-261, 3.0408903374280139822e-277, 2.8287088295287585094e-294, 1.7379794826680480784e-246, 2.4115446944063306384e-262, 2.202741251392177696e-278, 2.8287088295287585094e-294, 1.7379794826680480784e-246, 2.4115446944063306384e-262, 2.202741251392177696e-278, 2.8287088295287585094e-294, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 8.7279092175580810531e-280, 8.8634899828990930877e-296, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150276549e-299, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150276549e-299, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150276549e-299, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150276549e-299, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150276549e-299, 3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 
-2.7456019707854725967e-300, 1.6001805286092554504e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 4.8281933032132812475e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 4.8281933032132812475e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 2.0347903074934629333e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 6.3808880963355377617e-251, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 6.3808880963355377617e-251, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 2.8891343516857640937e-251, 5.1095823452235464813e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 1.1432574793608780349e-251, 1.2329569415922591084e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 2.7031904319843490867e-252, 2.638005906844371576e-268, 6.3790946999826013345e-284, -2.7456019707854725967e-300, 2.7031904319843490867e-252, 2.638005906844371576e-268, 6.3790946999826013345e-284, -2.7456019707854725967e-300, 5.2084434157824127104e-253, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482777461e-301, 5.2084434157824127104e-253, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482777461e-301, 5.2084434157824127104e-253, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482777461e-301, 2.4805108027747776379e-253, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482777461e-301, 1.1165444962709601017e-253, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482777461e-301, 4.3456134301905148502e-254, 6.3684349745470443788e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302, 9.3569766393097138822e-255, 2.5826679788133653036e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302, 9.3569766393097138822e-255, 
2.5826679788133653036e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302, 8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 2.3458177946667328156e-304, 8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 2.3458177946667328156e-304, 8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 2.3458177946667328156e-304, 8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 2.3458177946667328156e-304, 2.9938788518280315834e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 2.3458177946667328156e-304, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, 1.6338236616337094706e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, 8.0132469526175071002e-258, 2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306, 3.850752120757712373e-258, 2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306, 1.7695047048278150093e-258, 2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306, 7.2888099686286655858e-259, 5.581381609158630475e-275, 6.1155422068568946933e-291, 1.0380272777574237546e-306, 2.0856914288039227544e-259, -1.9524039360882352712e-276, -2.9779654517181712829e-292, -3.000817432603284506e-308, 2.0856914288039227544e-259, -1.9524039360882352712e-276, -2.9779654517181712829e-292, -3.000817432603284506e-308, 7.8491179384773690214e-260, -1.9524039360882352712e-276, -2.9779654517181712829e-292, 
-3.000817432603284506e-308, 1.345219763696439399e-260, 1.6579848156414234801e-276, 1.0303712682997738281e-292, 1.4493302844111182601e-308, 1.345219763696439399e-260, 1.6579848156414234801e-276, 1.0303712682997738281e-292, 1.4493302844111182601e-308, 1.345219763696439399e-260, 1.6579848156414234801e-276, 1.0303712682997738281e-292, 1.4493302844111182601e-308, 5.3223249184882342185e-261, -1.472095602234059958e-277, 2.8287088295287585094e-294, -1.0874435234232647519e-310, 1.2573885592501529789e-261, 3.0408903374280139822e-277, 2.8287088295287585094e-294, -1.0874435234232647519e-310, 1.2573885592501529789e-261, 3.0408903374280139822e-277, 2.8287088295287585094e-294, -1.0874435234232647519e-310, 2.4115446944063306384e-262, 2.202741251392177696e-278, 2.8287088295287585094e-294, -1.0874435234232647519e-310, 2.4115446944063306384e-262, 2.202741251392177696e-278, 2.8287088295287585094e-294, -1.0874435234232647519e-310, 2.4115446944063306384e-262, 2.202741251392177696e-278, 2.8287088295287585094e-294, -1.0874435234232647519e-310, 1.1412520821444306741e-262, -6.1787496089661820348e-279, -3.028042329852615431e-295, -2.182740474438892116e-311, 5.0610577601348040988e-263, 7.9243314524777990283e-279, -3.028042329852615431e-295, -2.182740474438892116e-311, 1.8853262294800541881e-263, 8.7279092175580810531e-280, 8.8634899828990930877e-296, -9.8167844904532653004e-314, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 9.8977243486757054781e-265, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 9.8977243486757054781e-265, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 4.9356438320276576408e-265, 
-8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 2.4546035737036337221e-265, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 1.2140834445416214873e-265, 1.8893435613692150014e-281, 3.0075895258731974416e-297, -9.8167844904532653004e-314, 5.9382337996061564537e-266, 5.1208955146257653156e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 2.8369334767011265554e-266, 5.1208955146257653156e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 1.2862833152486119506e-266, 1.6777604898591683764e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 5.1095823452235464813e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, -2.5539572388808429997e-317, 1.2329569415922591084e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, -2.5539572388808429997e-317, 1.2329569415922591084e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, -2.5539572388808429997e-317, 2.638005906844371576e-268, 6.3790946999826013345e-284, -2.7456019707854725967e-300, -2.5539572388808429997e-317, 2.638005906844371576e-268, 6.3790946999826013345e-284, -2.7456019707854725967e-300, -2.5539572388808429997e-317, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482773317e-301, 5.7350888195772519812e-317, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482773317e-301, 5.7350888195772519812e-317, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482773317e-301, 5.7350888195772519812e-317, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482773317e-301, 5.7350888195772519812e-317, 6.3684349745470443788e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302, 3.6369654387311681856e-319, 6.3684349745470443788e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302, 3.6369654387311681856e-319, 2.5826679788133653036e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302, 
3.6369654387311681856e-319, 6.8978448094652555593e-271, 1.1480487920352081009e-286, 7.5257037990230704094e-303, 3.6369654387311681856e-319, 6.8978448094652555593e-271, 1.1480487920352081009e-286, 7.5257037990230704094e-303, 3.6369654387311681856e-319, 2.1656360647981577662e-271, 9.7287370902823839435e-288, 1.6928061833779524157e-303, 3.6369654387311681856e-319, 2.1656360647981577662e-271, 9.7287370902823839435e-288, 1.6928061833779524157e-303, 3.6369654387311681856e-319, 9.825838786313830552e-272, 9.7287370902823839435e-288, 1.6928061833779524157e-303, 3.6369654387311681856e-319, 3.9105778554799569972e-272, 9.7287370902823839435e-288, 1.6928061833779524157e-303, 3.6369654387311681856e-319, 9.5294739006302120482e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322, 9.5294739006302120482e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322, 2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306, 6.4228533959362050743e-323, }; NOEXPORT ALIGNED(64) const float Sleef_rempitabsp[] = { 0.159154892, 5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 0.03415493667, 6.420638243e-09, 7.342738037e-17, 8.135951656e-24, 0.03415493667, 6.420638243e-09, 7.342738037e-17, 8.135951656e-24, 0.002904943191, -9.861969574e-11, -9.839336547e-18, -1.790215892e-24, 0.002904943191, -9.861969574e-11, -9.839336547e-18, -1.790215892e-24, 0.002904943191, -9.861969574e-11, -9.839336547e-18, -1.790215892e-24, 0.002904943191, -9.861969574e-11, -9.839336547e-18, -1.790215892e-24, 0.0009518179577, 1.342109202e-10, 1.791623576e-17, 1.518506657e-24, 0.0009518179577, 1.342109202e-10, 1.791623576e-17, 1.518506657e-24, 0.0004635368241, 1.779561221e-11, 4.038449606e-18, 
-1.358546052e-25, 0.0002193961991, 1.779561221e-11, 4.038449606e-18, -1.358546052e-25, 9.73258866e-05, 1.779561221e-11, 4.038449606e-18, -1.358546052e-25, 3.62907449e-05, 3.243700447e-12, 5.690024473e-19, 7.09405479e-26, 5.773168596e-06, 1.424711477e-12, 1.3532163e-19, 1.92417627e-26, 5.773168596e-06, 1.424711477e-12, 1.3532163e-19, 1.92417627e-26, 5.773168596e-06, 1.424711477e-12, 1.3532163e-19, 1.92417627e-26, 1.958472239e-06, 5.152167755e-13, 1.3532163e-19, 1.92417627e-26, 5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30, 5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30, 5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30, 5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30, 5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30, 5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30, 2.132179588e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30, 6.420638243e-09, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.420638243e-09, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 2.695347945e-09, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 8.327027956e-10, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 8.327027956e-10, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 3.670415083e-10, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 1.342109202e-10, 1.791623576e-17, 1.518506361e-24, 2.613904e-31, 1.779561221e-11, 4.038449606e-18, -1.358545683e-25, -3.443243946e-32, 1.779561221e-11, 4.038449606e-18, -1.358545683e-25, -3.443243946e-32, 1.779561221e-11, 4.038449606e-18, -1.358545683e-25, -3.443243946e-32, 3.243700447e-12, 5.690024473e-19, 7.094053557e-26, 1.487136711e-32, 3.243700447e-12, 5.690024473e-19, 7.094053557e-26, 1.487136711e-32, 3.243700447e-12, 5.690024473e-19, 7.094053557e-26, 1.487136711e-32, 1.424711477e-12, 1.3532163e-19, 1.924175961e-26, 2.545416018e-33, 5.152167755e-13, 1.3532163e-19, 1.924175961e-26, 2.545416018e-33, 
6.046956013e-14, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36, 6.046956013e-14, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36, 6.046956013e-14, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36, 3.626141271e-15, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36, 3.626141271e-15, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36, 3.626141271e-15, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36, 3.626141271e-15, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.296048013e-40, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.296048013e-40, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.296048013e-40, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.296048013e-40, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.296048013e-40, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.296048013e-40, 1.791623576e-17, 1.518506361e-24, 2.61390353e-31, 4.764937743e-38, 1.791623576e-17, 1.518506361e-24, 2.61390353e-31, 4.764937743e-38, 4.038449606e-18, -1.358545683e-25, -3.443243946e-32, 6.296048013e-40, 4.038449606e-18, -1.358545683e-25, -3.443243946e-32, 6.296048013e-40, 5.690024473e-19, 7.094053557e-26, 1.487136711e-32, 6.296048013e-40, 5.690024473e-19, 7.094053557e-26, 1.487136711e-32, 6.296048013e-40, 5.690024473e-19, 7.094053557e-26, 1.487136711e-32, 6.296048013e-40, 1.3532163e-19, 1.924175961e-26, 2.545415467e-33, 6.296048013e-40, 1.3532163e-19, 1.924175961e-26, 2.545415467e-33, 6.296048013e-40, 2.690143217e-20, -1.452834402e-28, -6.441077673e-36, -1.764234767e-42, 2.690143217e-20, -1.452834402e-28, -6.441077673e-36, -1.764234767e-42, 2.690143217e-20, -1.452834402e-28, -6.441077673e-36, -1.764234767e-42, 1.334890502e-20, -1.452834402e-28, -6.441077673e-36, -1.764234767e-42, 6.572641438e-21, -1.452834402e-28, -6.441077673e-36, -1.764234767e-42, 0.05874381959, 1.222115387e-08, 7.693612965e-16, 1.792054435e-22, 0.02749382704, 4.77057327e-09, 7.693612965e-16, 1.792054435e-22, 
0.01186883077, 1.045283415e-09, 3.252721926e-16, 7.332633139e-23, 0.00405633077, 1.045283415e-09, 3.252721926e-16, 7.332633139e-23, 0.000150081818, -2.454155802e-12, 1.161414894e-20, 1.291319272e-27, 0.000150081818, -2.454155802e-12, 1.161414894e-20, 1.291319272e-27, 0.000150081818, -2.454155802e-12, 1.161414894e-20, 1.291319272e-27, 0.000150081818, -2.454155802e-12, 1.161414894e-20, 1.291319272e-27, 0.000150081818, -2.454155802e-12, 1.161414894e-20, 1.291319272e-27, 2.801149822e-05, 4.821800945e-12, 8.789757674e-19, 1.208447639e-25, 2.801149822e-05, 4.821800945e-12, 8.789757674e-19, 1.208447639e-25, 2.801149822e-05, 4.821800945e-12, 8.789757674e-19, 1.208447639e-25, 1.275271279e-05, 1.183823005e-12, 1.161414894e-20, 1.291319272e-27, 5.12331826e-06, 1.183823005e-12, 1.161414894e-20, 1.291319272e-27, 1.308621904e-06, 2.743283031e-13, 1.161414894e-20, 1.291319272e-27, 1.308621904e-06, 2.743283031e-13, 1.161414894e-20, 1.291319272e-27, 3.549478151e-07, 4.695462769e-14, 1.161414894e-20, 1.291319272e-27, 3.549478151e-07, 4.695462769e-14, 1.161414894e-20, 1.291319272e-27, 1.165292645e-07, 1.853292503e-14, 4.837885366e-21, 1.291319272e-27, 1.165292645e-07, 1.853292503e-14, 4.837885366e-21, 1.291319272e-27, 5.69246339e-08, 4.322073705e-15, 1.449754789e-21, 7.962890365e-29, 2.712231151e-08, 4.322073705e-15, 1.449754789e-21, 7.962890365e-29, 1.222115387e-08, 7.693612965e-16, 1.792054182e-22, 2.91418027e-29, 4.77057327e-09, 7.693612965e-16, 1.792054182e-22, 2.91418027e-29, 1.045283415e-09, 3.252721926e-16, 7.332632508e-23, 3.898253736e-30, 1.045283415e-09, 3.252721926e-16, 7.332632508e-23, 3.898253736e-30, 1.139611461e-10, 1.996093359e-17, 5.344349223e-25, 1.511644828e-31, 1.139611461e-10, 1.996093359e-17, 5.344349223e-25, 1.511644828e-31, 1.139611461e-10, 1.996093359e-17, 5.344349223e-25, 1.511644828e-31, 1.139611461e-10, 1.996093359e-17, 5.344349223e-25, 1.511644828e-31, 5.575349904e-11, 6.083145782e-18, 5.344349223e-25, 1.511644828e-31, 2.664967552e-11, -8.557475018e-19, 
-8.595036458e-26, -2.139883875e-32, 1.209775682e-11, 2.61369883e-18, 5.344349223e-25, 1.511644828e-31, 4.821800945e-12, 8.789757674e-19, 1.208447639e-25, 3.253064536e-33, 1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, 1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, 2.743283031e-13, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, }; ================================================ FILE: src/rename.h ================================================ #ifndef RENAMESCALAR_H #define RENAMESCALAR_H /* ------------------------------------------------------------------------- */ /* Naming of functions scalar */ #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_scalar_f64 #define xsinf nsimd_sleef_sin_u35d_scalar_f32 #define xcos nsimd_sleef_cos_u35d_scalar_f64 #define xcosf nsimd_sleef_cos_u35d_scalar_f32 #define xsincos nsimd_sleef_sincos_u35d_scalar_f64 #define xsincosf nsimd_sleef_sincos_u35d_scalar_f32 #define xtan nsimd_sleef_tan_u35d_scalar_f64 #define xtanf nsimd_sleef_tan_u35d_scalar_f32 #define xasin nsimd_sleef_asin_u35d_scalar_f64 #define xasinf nsimd_sleef_asin_u35d_scalar_f32 #define xacos nsimd_sleef_acos_u35d_scalar_f64 #define xacosf nsimd_sleef_acos_u35d_scalar_f32 #define xatan nsimd_sleef_atan_u35d_scalar_f64 #define xatanf nsimd_sleef_atan_u35d_scalar_f32 #define xatan2 nsimd_sleef_atan2_u35d_scalar_f64 #define xatan2f nsimd_sleef_atan2_u35d_scalar_f32 #define xlog nsimd_sleef_log_u35d_scalar_f64 #define xlogf nsimd_sleef_log_u35d_scalar_f32 #define xcbrt nsimd_sleef_cbrt_u35d_scalar_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_scalar_f32 #define xsin_u1 nsimd_sleef_sin_u10d_scalar_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_scalar_f32 #define xcos_u1 nsimd_sleef_cos_u10d_scalar_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_scalar_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_scalar_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_scalar_f32 #define xtan_u1 nsimd_sleef_tan_u10d_scalar_f64 #define xtanf_u1 
nsimd_sleef_tan_u10d_scalar_f32 #define xasin_u1 nsimd_sleef_asin_u10d_scalar_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_scalar_f32 #define xacos_u1 nsimd_sleef_acos_u10d_scalar_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_scalar_f32 #define xatan_u1 nsimd_sleef_atan_u10d_scalar_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_scalar_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_scalar_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_scalar_f32 #define xlog_u1 nsimd_sleef_log_u10d_scalar_f64 #define xlogf_u1 nsimd_sleef_log_u10d_scalar_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_scalar_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_scalar_f32 #define xexp nsimd_sleef_exp_u10d_scalar_f64 #define xexpf nsimd_sleef_exp_u10d_scalar_f32 #define xpow nsimd_sleef_pow_u10d_scalar_f64 #define xpowf nsimd_sleef_pow_u10d_scalar_f32 #define xsinh nsimd_sleef_sinh_u10d_scalar_f64 #define xsinhf nsimd_sleef_sinh_u10d_scalar_f32 #define xcosh nsimd_sleef_cosh_u10d_scalar_f64 #define xcoshf nsimd_sleef_cosh_u10d_scalar_f32 #define xtanh nsimd_sleef_tanh_u10d_scalar_f64 #define xtanhf nsimd_sleef_tanh_u10d_scalar_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_scalar_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_scalar_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_scalar_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_scalar_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_scalar_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_scalar_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_scalar_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_scalar_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_scalar_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_scalar_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_scalar_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_scalar_f32 #define xasinh nsimd_sleef_asinh_u10d_scalar_f64 #define xasinhf nsimd_sleef_asinh_u10d_scalar_f32 #define xacosh nsimd_sleef_acosh_u10d_scalar_f64 #define xacoshf nsimd_sleef_acosh_u10d_scalar_f32 #define xatanh 
nsimd_sleef_atanh_u10d_scalar_f64 #define xatanhf nsimd_sleef_atanh_u10d_scalar_f32 #define xexp2 nsimd_sleef_exp2_u10d_scalar_f64 #define xexp2f nsimd_sleef_exp2_u10d_scalar_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_scalar_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_scalar_f32 #define xexp10 nsimd_sleef_exp10_u10d_scalar_f64 #define xexp10f nsimd_sleef_exp10_u10d_scalar_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_scalar_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_scalar_f32 #define xexpm1 nsimd_sleef_expm1_u10d_scalar_f64 #define xexpm1f nsimd_sleef_expm1_u10d_scalar_f32 #define xlog10 nsimd_sleef_log10_u10d_scalar_f64 #define xlog10f nsimd_sleef_log10_u10d_scalar_f32 #define xlog2 nsimd_sleef_log2_u10d_scalar_f64 #define xlog2f nsimd_sleef_log2_u10d_scalar_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_scalar_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_scalar_f32 #define xlog1p nsimd_sleef_log1p_u10d_scalar_f64 #define xlog1pf nsimd_sleef_log1p_u10d_scalar_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_scalar_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_scalar_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_scalar_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_scalar_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_scalar_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_scalar_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_scalar_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_scalar_f32 #define xldexp nsimd_sleef_ldexp_scalar_f64 #define xldexpf nsimd_sleef_ldexp_scalar_f32 #define xilogb nsimd_sleef_ilogb_scalar_f64 #define xilogbf nsimd_sleef_ilogb_scalar_f32 #define xfma nsimd_sleef_fma_scalar_f64 #define xfmaf nsimd_sleef_fma_scalar_f32 #define xsqrt nsimd_sleef_sqrt_scalar_f64 #define xsqrtf nsimd_sleef_sqrt_scalar_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_scalar_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_scalar_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_scalar_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_scalar_f32 #define 
xhypot_u05 nsimd_sleef_hypot_u05d_scalar_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_scalar_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_scalar_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_scalar_f32 #define xfabs nsimd_sleef_fabs_scalar_f64 #define xfabsf nsimd_sleef_fabs_scalar_f32 #define xcopysign nsimd_sleef_copysign_scalar_f64 #define xcopysignf nsimd_sleef_copysign_scalar_f32 #define xfmax nsimd_sleef_fmax_scalar_f64 #define xfmaxf nsimd_sleef_fmax_scalar_f32 #define xfmin nsimd_sleef_fmin_scalar_f64 #define xfminf nsimd_sleef_fmin_scalar_f32 #define xfdim nsimd_sleef_fdim_scalar_f64 #define xfdimf nsimd_sleef_fdim_scalar_f32 #define xtrunc nsimd_sleef_trunc_scalar_f64 #define xtruncf nsimd_sleef_trunc_scalar_f32 #define xfloor nsimd_sleef_floor_scalar_f64 #define xfloorf nsimd_sleef_floor_scalar_f32 #define xceil nsimd_sleef_ceil_scalar_f64 #define xceilf nsimd_sleef_ceil_scalar_f32 #define xround nsimd_sleef_round_scalar_f64 #define xroundf nsimd_sleef_round_scalar_f32 #define xrint nsimd_sleef_rint_scalar_f64 #define xrintf nsimd_sleef_rint_scalar_f32 #define xnextafter nsimd_sleef_nextafter_scalar_f64 #define xnextafterf nsimd_sleef_nextafter_scalar_f32 #define xfrfrexp nsimd_sleef_frfrexp_scalar_f64 #define xfrfrexpf nsimd_sleef_frfrexp_scalar_f32 #define xexpfrexp nsimd_sleef_expfrexp_scalar_f64 #define xexpfrexpf nsimd_sleef_expfrexp_scalar_f32 #define xfmod nsimd_sleef_fmod_scalar_f64 #define xfmodf nsimd_sleef_fmod_scalar_f32 #define xremainder nsimd_sleef_remainder_scalar_f64 #define xremainderf nsimd_sleef_remainder_scalar_f32 #define xmodf nsimd_sleef_modf_scalar_f64 #define xmodff nsimd_sleef_modf_scalar_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_scalar_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_scalar_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_scalar_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_scalar_f32 #define xerf_u1 nsimd_sleef_erf_u10d_scalar_f64 #define xerff_u1 nsimd_sleef_erf_u10d_scalar_f32 #define xerfc_u15 
nsimd_sleef_erfc_u15d_scalar_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_scalar_f32 #define xgetInt nsimd_sleef_getInt_scalar_f64 #define xgetIntf nsimd_sleef_getInt_scalar_f32 #define xgetPtr nsimd_sleef_getPtr_scalar_f64 #define xgetPtrf nsimd_sleef_getPtr_scalar_f32 #else #define xsin nsimd_sleef_sin_u35_scalar_f64 #define xsinf nsimd_sleef_sin_u35_scalar_f32 #define xcos nsimd_sleef_cos_u35_scalar_f64 #define xcosf nsimd_sleef_cos_u35_scalar_f32 #define xsincos nsimd_sleef_sincos_u35_scalar_f64 #define xsincosf nsimd_sleef_sincos_u35_scalar_f32 #define xtan nsimd_sleef_tan_u35_scalar_f64 #define xtanf nsimd_sleef_tan_u35_scalar_f32 #define xasin nsimd_sleef_asin_u35_scalar_f64 #define xasinf nsimd_sleef_asin_u35_scalar_f32 #define xacos nsimd_sleef_acos_u35_scalar_f64 #define xacosf nsimd_sleef_acos_u35_scalar_f32 #define xatan nsimd_sleef_atan_u35_scalar_f64 #define xatanf nsimd_sleef_atan_u35_scalar_f32 #define xatan2 nsimd_sleef_atan2_u35_scalar_f64 #define xatan2f nsimd_sleef_atan2_u35_scalar_f32 #define xlog nsimd_sleef_log_u35_scalar_f64 #define xlogf nsimd_sleef_log_u35_scalar_f32 #define xcbrt nsimd_sleef_cbrt_u35_scalar_f64 #define xcbrtf nsimd_sleef_cbrt_u35_scalar_f32 #define xsin_u1 nsimd_sleef_sin_u10_scalar_f64 #define xsinf_u1 nsimd_sleef_sin_u10_scalar_f32 #define xcos_u1 nsimd_sleef_cos_u10_scalar_f64 #define xcosf_u1 nsimd_sleef_cos_u10_scalar_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_scalar_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_scalar_f32 #define xtan_u1 nsimd_sleef_tan_u10_scalar_f64 #define xtanf_u1 nsimd_sleef_tan_u10_scalar_f32 #define xasin_u1 nsimd_sleef_asin_u10_scalar_f64 #define xasinf_u1 nsimd_sleef_asin_u10_scalar_f32 #define xacos_u1 nsimd_sleef_acos_u10_scalar_f64 #define xacosf_u1 nsimd_sleef_acos_u10_scalar_f32 #define xatan_u1 nsimd_sleef_atan_u10_scalar_f64 #define xatanf_u1 nsimd_sleef_atan_u10_scalar_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_scalar_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_scalar_f32 
#define xlog_u1 nsimd_sleef_log_u10_scalar_f64 #define xlogf_u1 nsimd_sleef_log_u10_scalar_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_scalar_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_scalar_f32 #define xexp nsimd_sleef_exp_u10_scalar_f64 #define xexpf nsimd_sleef_exp_u10_scalar_f32 #define xpow nsimd_sleef_pow_u10_scalar_f64 #define xpowf nsimd_sleef_pow_u10_scalar_f32 #define xsinh nsimd_sleef_sinh_u10_scalar_f64 #define xsinhf nsimd_sleef_sinh_u10_scalar_f32 #define xcosh nsimd_sleef_cosh_u10_scalar_f64 #define xcoshf nsimd_sleef_cosh_u10_scalar_f32 #define xtanh nsimd_sleef_tanh_u10_scalar_f64 #define xtanhf nsimd_sleef_tanh_u10_scalar_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_scalar_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_scalar_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_scalar_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_scalar_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_scalar_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_scalar_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_scalar_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_scalar_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_scalar_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_scalar_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_scalar_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_scalar_f32 #define xasinh nsimd_sleef_asinh_u10_scalar_f64 #define xasinhf nsimd_sleef_asinh_u10_scalar_f32 #define xacosh nsimd_sleef_acosh_u10_scalar_f64 #define xacoshf nsimd_sleef_acosh_u10_scalar_f32 #define xatanh nsimd_sleef_atanh_u10_scalar_f64 #define xatanhf nsimd_sleef_atanh_u10_scalar_f32 #define xexp2 nsimd_sleef_exp2_u10_scalar_f64 #define xexp2f nsimd_sleef_exp2_u10_scalar_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_scalar_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_scalar_f32 #define xexp10 nsimd_sleef_exp10_u10_scalar_f64 #define xexp10f nsimd_sleef_exp10_u10_scalar_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_scalar_f64 #define xexp10f_u35 
nsimd_sleef_exp10_u35_scalar_f32 #define xexpm1 nsimd_sleef_expm1_u10_scalar_f64 #define xexpm1f nsimd_sleef_expm1_u10_scalar_f32 #define xlog10 nsimd_sleef_log10_u10_scalar_f64 #define xlog10f nsimd_sleef_log10_u10_scalar_f32 #define xlog2 nsimd_sleef_log2_u10_scalar_f64 #define xlog2f nsimd_sleef_log2_u10_scalar_f32 #define xlog2_u35 nsimd_sleef_log2_u35_scalar_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_scalar_f32 #define xlog1p nsimd_sleef_log1p_u10_scalar_f64 #define xlog1pf nsimd_sleef_log1p_u10_scalar_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_scalar_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_scalar_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_scalar_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_scalar_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_scalar_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_scalar_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_scalar_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_scalar_f32 #define xldexp nsimd_sleef_ldexp_scalar_f64 #define xldexpf nsimd_sleef_ldexp_scalar_f32 #define xilogb nsimd_sleef_ilogb_scalar_f64 #define xilogbf nsimd_sleef_ilogb_scalar_f32 #define xfma nsimd_sleef_fma_scalar_f64 #define xfmaf nsimd_sleef_fma_scalar_f32 #define xsqrt nsimd_sleef_sqrt_scalar_f64 #define xsqrtf nsimd_sleef_sqrt_scalar_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_scalar_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_scalar_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_scalar_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_scalar_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_scalar_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_scalar_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_scalar_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_scalar_f32 #define xfabs nsimd_sleef_fabs_scalar_f64 #define xfabsf nsimd_sleef_fabs_scalar_f32 #define xcopysign nsimd_sleef_copysign_scalar_f64 #define xcopysignf nsimd_sleef_copysign_scalar_f32 #define xfmax nsimd_sleef_fmax_scalar_f64 #define xfmaxf nsimd_sleef_fmax_scalar_f32 #define 
xfmin nsimd_sleef_fmin_scalar_f64 #define xfminf nsimd_sleef_fmin_scalar_f32 #define xfdim nsimd_sleef_fdim_scalar_f64 #define xfdimf nsimd_sleef_fdim_scalar_f32 #define xtrunc nsimd_sleef_trunc_scalar_f64 #define xtruncf nsimd_sleef_trunc_scalar_f32 #define xfloor nsimd_sleef_floor_scalar_f64 #define xfloorf nsimd_sleef_floor_scalar_f32 #define xceil nsimd_sleef_ceil_scalar_f64 #define xceilf nsimd_sleef_ceil_scalar_f32 #define xround nsimd_sleef_round_scalar_f64 #define xroundf nsimd_sleef_round_scalar_f32 #define xrint nsimd_sleef_rint_scalar_f64 #define xrintf nsimd_sleef_rint_scalar_f32 #define xnextafter nsimd_sleef_nextafter_scalar_f64 #define xnextafterf nsimd_sleef_nextafter_scalar_f32 #define xfrfrexp nsimd_sleef_frfrexp_scalar_f64 #define xfrfrexpf nsimd_sleef_frfrexp_scalar_f32 #define xexpfrexp nsimd_sleef_expfrexp_scalar_f64 #define xexpfrexpf nsimd_sleef_expfrexp_scalar_f32 #define xfmod nsimd_sleef_fmod_scalar_f64 #define xfmodf nsimd_sleef_fmod_scalar_f32 #define xremainder nsimd_sleef_remainder_scalar_f64 #define xremainderf nsimd_sleef_remainder_scalar_f32 #define xmodf nsimd_sleef_modf_scalar_f64 #define xmodff nsimd_sleef_modf_scalar_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_scalar_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_scalar_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_scalar_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_scalar_f32 #define xerf_u1 nsimd_sleef_erf_u10_scalar_f64 #define xerff_u1 nsimd_sleef_erf_u10_scalar_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_scalar_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_scalar_f32 #define xgetInt nsimd_sleef_getInt_scalar_f64 #define xgetIntf nsimd_sleef_getInt_scalar_f32 #define xgetPtr nsimd_sleef_getPtr_scalar_f64 #define xgetPtrf nsimd_sleef_getPtr_scalar_f32 #endif #define rempi nsimd_sleef_rempi_scalar #define rempif nsimd_sleef_rempif_scalar #define rempisub nsimd_sleef_rempisub_scalar #define rempisubf nsimd_sleef_rempisubf_scalar #define gammak nsimd_gammak_scalar #define 
gammafk nsimd_gammafk_scalar #endif ================================================ FILE: src/renameadvsimd.h ================================================ #ifndef RENAMEADVSIMD_H #define RENAMEADVSIMD_H /* ------------------------------------------------------------------------- */ /* Naming of functions aarch64 */ #ifdef NSIMD_AARCH64 #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_aarch64_f64 #define xsinf nsimd_sleef_sin_u35d_aarch64_f32 #define xcos nsimd_sleef_cos_u35d_aarch64_f64 #define xcosf nsimd_sleef_cos_u35d_aarch64_f32 #define xsincos nsimd_sleef_sincos_u35d_aarch64_f64 #define xsincosf nsimd_sleef_sincos_u35d_aarch64_f32 #define xtan nsimd_sleef_tan_u35d_aarch64_f64 #define xtanf nsimd_sleef_tan_u35d_aarch64_f32 #define xasin nsimd_sleef_asin_u35d_aarch64_f64 #define xasinf nsimd_sleef_asin_u35d_aarch64_f32 #define xacos nsimd_sleef_acos_u35d_aarch64_f64 #define xacosf nsimd_sleef_acos_u35d_aarch64_f32 #define xatan nsimd_sleef_atan_u35d_aarch64_f64 #define xatanf nsimd_sleef_atan_u35d_aarch64_f32 #define xatan2 nsimd_sleef_atan2_u35d_aarch64_f64 #define xatan2f nsimd_sleef_atan2_u35d_aarch64_f32 #define xlog nsimd_sleef_log_u35d_aarch64_f64 #define xlogf nsimd_sleef_log_u35d_aarch64_f32 #define xcbrt nsimd_sleef_cbrt_u35d_aarch64_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_aarch64_f32 #define xsin_u1 nsimd_sleef_sin_u10d_aarch64_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_aarch64_f32 #define xcos_u1 nsimd_sleef_cos_u10d_aarch64_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_aarch64_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_aarch64_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_aarch64_f32 #define xtan_u1 nsimd_sleef_tan_u10d_aarch64_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_aarch64_f32 #define xasin_u1 nsimd_sleef_asin_u10d_aarch64_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_aarch64_f32 #define xacos_u1 nsimd_sleef_acos_u10d_aarch64_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_aarch64_f32 #define xatan_u1 nsimd_sleef_atan_u10d_aarch64_f64 
#define xatanf_u1 nsimd_sleef_atan_u10d_aarch64_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_aarch64_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_aarch64_f32 #define xlog_u1 nsimd_sleef_log_u10d_aarch64_f64 #define xlogf_u1 nsimd_sleef_log_u10d_aarch64_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_aarch64_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_aarch64_f32 #define xexp nsimd_sleef_exp_u10d_aarch64_f64 #define xexpf nsimd_sleef_exp_u10d_aarch64_f32 #define xpow nsimd_sleef_pow_u10d_aarch64_f64 #define xpowf nsimd_sleef_pow_u10d_aarch64_f32 #define xsinh nsimd_sleef_sinh_u10d_aarch64_f64 #define xsinhf nsimd_sleef_sinh_u10d_aarch64_f32 #define xcosh nsimd_sleef_cosh_u10d_aarch64_f64 #define xcoshf nsimd_sleef_cosh_u10d_aarch64_f32 #define xtanh nsimd_sleef_tanh_u10d_aarch64_f64 #define xtanhf nsimd_sleef_tanh_u10d_aarch64_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_aarch64_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_aarch64_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_aarch64_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_aarch64_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_aarch64_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_aarch64_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_aarch64_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_aarch64_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_aarch64_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_aarch64_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_aarch64_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_aarch64_f32 #define xasinh nsimd_sleef_asinh_u10d_aarch64_f64 #define xasinhf nsimd_sleef_asinh_u10d_aarch64_f32 #define xacosh nsimd_sleef_acosh_u10d_aarch64_f64 #define xacoshf nsimd_sleef_acosh_u10d_aarch64_f32 #define xatanh nsimd_sleef_atanh_u10d_aarch64_f64 #define xatanhf nsimd_sleef_atanh_u10d_aarch64_f32 #define xexp2 nsimd_sleef_exp2_u10d_aarch64_f64 #define xexp2f nsimd_sleef_exp2_u10d_aarch64_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_aarch64_f64 #define xexp2f_u35 
nsimd_sleef_exp2_u35d_aarch64_f32 #define xexp10 nsimd_sleef_exp10_u10d_aarch64_f64 #define xexp10f nsimd_sleef_exp10_u10d_aarch64_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_aarch64_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_aarch64_f32 #define xexpm1 nsimd_sleef_expm1_u10d_aarch64_f64 #define xexpm1f nsimd_sleef_expm1_u10d_aarch64_f32 #define xlog10 nsimd_sleef_log10_u10d_aarch64_f64 #define xlog10f nsimd_sleef_log10_u10d_aarch64_f32 #define xlog2 nsimd_sleef_log2_u10d_aarch64_f64 #define xlog2f nsimd_sleef_log2_u10d_aarch64_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_aarch64_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_aarch64_f32 #define xlog1p nsimd_sleef_log1p_u10d_aarch64_f64 #define xlog1pf nsimd_sleef_log1p_u10d_aarch64_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_aarch64_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_aarch64_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_aarch64_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_aarch64_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_aarch64_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_aarch64_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_aarch64_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_aarch64_f32 #define xldexp nsimd_sleef_ldexp_aarch64_f64 #define xldexpf nsimd_sleef_ldexp_aarch64_f32 #define xilogb nsimd_sleef_ilogb_aarch64_f64 #define xilogbf nsimd_sleef_ilogb_aarch64_f32 #define xfma nsimd_sleef_fma_aarch64_f64 #define xfmaf nsimd_sleef_fma_aarch64_f32 #define xsqrt nsimd_sleef_sqrt_aarch64_f64 #define xsqrtf nsimd_sleef_sqrt_aarch64_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_aarch64_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_aarch64_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_aarch64_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_aarch64_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_aarch64_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_aarch64_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_aarch64_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_aarch64_f32 #define 
xfabs nsimd_sleef_fabs_aarch64_f64 #define xfabsf nsimd_sleef_fabs_aarch64_f32 #define xcopysign nsimd_sleef_copysign_aarch64_f64 #define xcopysignf nsimd_sleef_copysign_aarch64_f32 #define xfmax nsimd_sleef_fmax_aarch64_f64 #define xfmaxf nsimd_sleef_fmax_aarch64_f32 #define xfmin nsimd_sleef_fmin_aarch64_f64 #define xfminf nsimd_sleef_fmin_aarch64_f32 #define xfdim nsimd_sleef_fdim_aarch64_f64 #define xfdimf nsimd_sleef_fdim_aarch64_f32 #define xtrunc nsimd_sleef_trunc_aarch64_f64 #define xtruncf nsimd_sleef_trunc_aarch64_f32 #define xfloor nsimd_sleef_floor_aarch64_f64 #define xfloorf nsimd_sleef_floor_aarch64_f32 #define xceil nsimd_sleef_ceil_aarch64_f64 #define xceilf nsimd_sleef_ceil_aarch64_f32 #define xround nsimd_sleef_round_aarch64_f64 #define xroundf nsimd_sleef_round_aarch64_f32 #define xrint nsimd_sleef_rint_aarch64_f64 #define xrintf nsimd_sleef_rint_aarch64_f32 #define xnextafter nsimd_sleef_nextafter_aarch64_f64 #define xnextafterf nsimd_sleef_nextafter_aarch64_f32 #define xfrfrexp nsimd_sleef_frfrexp_aarch64_f64 #define xfrfrexpf nsimd_sleef_frfrexp_aarch64_f32 #define xexpfrexp nsimd_sleef_expfrexp_aarch64_f64 #define xexpfrexpf nsimd_sleef_expfrexp_aarch64_f32 #define xfmod nsimd_sleef_fmod_aarch64_f64 #define xfmodf nsimd_sleef_fmod_aarch64_f32 #define xremainder nsimd_sleef_remainder_aarch64_f64 #define xremainderf nsimd_sleef_remainder_aarch64_f32 #define xmodf nsimd_sleef_modf_aarch64_f64 #define xmodff nsimd_sleef_modf_aarch64_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_aarch64_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_aarch64_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_aarch64_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_aarch64_f32 #define xerf_u1 nsimd_sleef_erf_u10d_aarch64_f64 #define xerff_u1 nsimd_sleef_erf_u10d_aarch64_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_aarch64_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_aarch64_f32 #define xgetInt nsimd_sleef_getInt_aarch64_f64 #define xgetIntf 
nsimd_sleef_getInt_aarch64_f32 #define xgetPtr nsimd_sleef_getPtr_aarch64_f64 #define xgetPtrf nsimd_sleef_getPtr_aarch64_f32 #else #define xsin nsimd_sleef_sin_u35_aarch64_f64 #define xsinf nsimd_sleef_sin_u35_aarch64_f32 #define xcos nsimd_sleef_cos_u35_aarch64_f64 #define xcosf nsimd_sleef_cos_u35_aarch64_f32 #define xsincos nsimd_sleef_sincos_u35_aarch64_f64 #define xsincosf nsimd_sleef_sincos_u35_aarch64_f32 #define xtan nsimd_sleef_tan_u35_aarch64_f64 #define xtanf nsimd_sleef_tan_u35_aarch64_f32 #define xasin nsimd_sleef_asin_u35_aarch64_f64 #define xasinf nsimd_sleef_asin_u35_aarch64_f32 #define xacos nsimd_sleef_acos_u35_aarch64_f64 #define xacosf nsimd_sleef_acos_u35_aarch64_f32 #define xatan nsimd_sleef_atan_u35_aarch64_f64 #define xatanf nsimd_sleef_atan_u35_aarch64_f32 #define xatan2 nsimd_sleef_atan2_u35_aarch64_f64 #define xatan2f nsimd_sleef_atan2_u35_aarch64_f32 #define xlog nsimd_sleef_log_u35_aarch64_f64 #define xlogf nsimd_sleef_log_u35_aarch64_f32 #define xcbrt nsimd_sleef_cbrt_u35_aarch64_f64 #define xcbrtf nsimd_sleef_cbrt_u35_aarch64_f32 #define xsin_u1 nsimd_sleef_sin_u10_aarch64_f64 #define xsinf_u1 nsimd_sleef_sin_u10_aarch64_f32 #define xcos_u1 nsimd_sleef_cos_u10_aarch64_f64 #define xcosf_u1 nsimd_sleef_cos_u10_aarch64_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_aarch64_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_aarch64_f32 #define xtan_u1 nsimd_sleef_tan_u10_aarch64_f64 #define xtanf_u1 nsimd_sleef_tan_u10_aarch64_f32 #define xasin_u1 nsimd_sleef_asin_u10_aarch64_f64 #define xasinf_u1 nsimd_sleef_asin_u10_aarch64_f32 #define xacos_u1 nsimd_sleef_acos_u10_aarch64_f64 #define xacosf_u1 nsimd_sleef_acos_u10_aarch64_f32 #define xatan_u1 nsimd_sleef_atan_u10_aarch64_f64 #define xatanf_u1 nsimd_sleef_atan_u10_aarch64_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_aarch64_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_aarch64_f32 #define xlog_u1 nsimd_sleef_log_u10_aarch64_f64 #define xlogf_u1 nsimd_sleef_log_u10_aarch64_f32 #define xcbrt_u1 
nsimd_sleef_cbrt_u10_aarch64_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_aarch64_f32 #define xexp nsimd_sleef_exp_u10_aarch64_f64 #define xexpf nsimd_sleef_exp_u10_aarch64_f32 #define xpow nsimd_sleef_pow_u10_aarch64_f64 #define xpowf nsimd_sleef_pow_u10_aarch64_f32 #define xsinh nsimd_sleef_sinh_u10_aarch64_f64 #define xsinhf nsimd_sleef_sinh_u10_aarch64_f32 #define xcosh nsimd_sleef_cosh_u10_aarch64_f64 #define xcoshf nsimd_sleef_cosh_u10_aarch64_f32 #define xtanh nsimd_sleef_tanh_u10_aarch64_f64 #define xtanhf nsimd_sleef_tanh_u10_aarch64_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_aarch64_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_aarch64_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_aarch64_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_aarch64_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_aarch64_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_aarch64_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_aarch64_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_aarch64_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_aarch64_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_aarch64_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_aarch64_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_aarch64_f32 #define xasinh nsimd_sleef_asinh_u10_aarch64_f64 #define xasinhf nsimd_sleef_asinh_u10_aarch64_f32 #define xacosh nsimd_sleef_acosh_u10_aarch64_f64 #define xacoshf nsimd_sleef_acosh_u10_aarch64_f32 #define xatanh nsimd_sleef_atanh_u10_aarch64_f64 #define xatanhf nsimd_sleef_atanh_u10_aarch64_f32 #define xexp2 nsimd_sleef_exp2_u10_aarch64_f64 #define xexp2f nsimd_sleef_exp2_u10_aarch64_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_aarch64_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_aarch64_f32 #define xexp10 nsimd_sleef_exp10_u10_aarch64_f64 #define xexp10f nsimd_sleef_exp10_u10_aarch64_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_aarch64_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_aarch64_f32 #define xexpm1 nsimd_sleef_expm1_u10_aarch64_f64 #define xexpm1f 
nsimd_sleef_expm1_u10_aarch64_f32 #define xlog10 nsimd_sleef_log10_u10_aarch64_f64 #define xlog10f nsimd_sleef_log10_u10_aarch64_f32 #define xlog2 nsimd_sleef_log2_u10_aarch64_f64 #define xlog2f nsimd_sleef_log2_u10_aarch64_f32 #define xlog2_u35 nsimd_sleef_log2_u35_aarch64_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_aarch64_f32 #define xlog1p nsimd_sleef_log1p_u10_aarch64_f64 #define xlog1pf nsimd_sleef_log1p_u10_aarch64_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_aarch64_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_aarch64_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_aarch64_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_aarch64_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_aarch64_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_aarch64_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_aarch64_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_aarch64_f32 #define xldexp nsimd_sleef_ldexp_aarch64_f64 #define xldexpf nsimd_sleef_ldexp_aarch64_f32 #define xilogb nsimd_sleef_ilogb_aarch64_f64 #define xilogbf nsimd_sleef_ilogb_aarch64_f32 #define xfma nsimd_sleef_fma_aarch64_f64 #define xfmaf nsimd_sleef_fma_aarch64_f32 #define xsqrt nsimd_sleef_sqrt_aarch64_f64 #define xsqrtf nsimd_sleef_sqrt_aarch64_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_aarch64_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_aarch64_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_aarch64_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_aarch64_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_aarch64_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_aarch64_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_aarch64_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_aarch64_f32 #define xfabs nsimd_sleef_fabs_aarch64_f64 #define xfabsf nsimd_sleef_fabs_aarch64_f32 #define xcopysign nsimd_sleef_copysign_aarch64_f64 #define xcopysignf nsimd_sleef_copysign_aarch64_f32 #define xfmax nsimd_sleef_fmax_aarch64_f64 #define xfmaxf nsimd_sleef_fmax_aarch64_f32 #define xfmin nsimd_sleef_fmin_aarch64_f64 #define xfminf 
nsimd_sleef_fmin_aarch64_f32 #define xfdim nsimd_sleef_fdim_aarch64_f64 #define xfdimf nsimd_sleef_fdim_aarch64_f32 #define xtrunc nsimd_sleef_trunc_aarch64_f64 #define xtruncf nsimd_sleef_trunc_aarch64_f32 #define xfloor nsimd_sleef_floor_aarch64_f64 #define xfloorf nsimd_sleef_floor_aarch64_f32 #define xceil nsimd_sleef_ceil_aarch64_f64 #define xceilf nsimd_sleef_ceil_aarch64_f32 #define xround nsimd_sleef_round_aarch64_f64 #define xroundf nsimd_sleef_round_aarch64_f32 #define xrint nsimd_sleef_rint_aarch64_f64 #define xrintf nsimd_sleef_rint_aarch64_f32 #define xnextafter nsimd_sleef_nextafter_aarch64_f64 #define xnextafterf nsimd_sleef_nextafter_aarch64_f32 #define xfrfrexp nsimd_sleef_frfrexp_aarch64_f64 #define xfrfrexpf nsimd_sleef_frfrexp_aarch64_f32 #define xexpfrexp nsimd_sleef_expfrexp_aarch64_f64 #define xexpfrexpf nsimd_sleef_expfrexp_aarch64_f32 #define xfmod nsimd_sleef_fmod_aarch64_f64 #define xfmodf nsimd_sleef_fmod_aarch64_f32 #define xremainder nsimd_sleef_remainder_aarch64_f64 #define xremainderf nsimd_sleef_remainder_aarch64_f32 #define xmodf nsimd_sleef_modf_aarch64_f64 #define xmodff nsimd_sleef_modf_aarch64_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_aarch64_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_aarch64_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_aarch64_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_aarch64_f32 #define xerf_u1 nsimd_sleef_erf_u10_aarch64_f64 #define xerff_u1 nsimd_sleef_erf_u10_aarch64_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_aarch64_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_aarch64_f32 #define xgetInt nsimd_sleef_getInt_aarch64_f64 #define xgetIntf nsimd_sleef_getInt_aarch64_f32 #define xgetPtr nsimd_sleef_getPtr_aarch64_f64 #define xgetPtrf nsimd_sleef_getPtr_aarch64_f32 #endif #define rempi nsimd_sleef_rempi_aarch64 #define rempif nsimd_sleef_rempif_aarch64 #define rempisub nsimd_sleef_rempisub_aarch64 #define rempisubf nsimd_sleef_rempisubf_aarch64 #define gammak nsimd_gammak_aarch64 #define gammafk 
nsimd_gammafk_aarch64 #endif #endif ================================================ FILE: src/renameavx.h ================================================ #ifndef RENAMEAVX_H #define RENAMEAVX_H /* ------------------------------------------------------------------------- */ /* Naming of functions avx */ #ifdef NSIMD_AVX #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_avx_f64 #define xsinf nsimd_sleef_sin_u35d_avx_f32 #define xcos nsimd_sleef_cos_u35d_avx_f64 #define xcosf nsimd_sleef_cos_u35d_avx_f32 #define xsincos nsimd_sleef_sincos_u35d_avx_f64 #define xsincosf nsimd_sleef_sincos_u35d_avx_f32 #define xtan nsimd_sleef_tan_u35d_avx_f64 #define xtanf nsimd_sleef_tan_u35d_avx_f32 #define xasin nsimd_sleef_asin_u35d_avx_f64 #define xasinf nsimd_sleef_asin_u35d_avx_f32 #define xacos nsimd_sleef_acos_u35d_avx_f64 #define xacosf nsimd_sleef_acos_u35d_avx_f32 #define xatan nsimd_sleef_atan_u35d_avx_f64 #define xatanf nsimd_sleef_atan_u35d_avx_f32 #define xatan2 nsimd_sleef_atan2_u35d_avx_f64 #define xatan2f nsimd_sleef_atan2_u35d_avx_f32 #define xlog nsimd_sleef_log_u35d_avx_f64 #define xlogf nsimd_sleef_log_u35d_avx_f32 #define xcbrt nsimd_sleef_cbrt_u35d_avx_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_avx_f32 #define xsin_u1 nsimd_sleef_sin_u10d_avx_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_avx_f32 #define xcos_u1 nsimd_sleef_cos_u10d_avx_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_avx_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_avx_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_avx_f32 #define xtan_u1 nsimd_sleef_tan_u10d_avx_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_avx_f32 #define xasin_u1 nsimd_sleef_asin_u10d_avx_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_avx_f32 #define xacos_u1 nsimd_sleef_acos_u10d_avx_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_avx_f32 #define xatan_u1 nsimd_sleef_atan_u10d_avx_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_avx_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_avx_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_avx_f32 #define 
xlog_u1 nsimd_sleef_log_u10d_avx_f64 #define xlogf_u1 nsimd_sleef_log_u10d_avx_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_avx_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_avx_f32 #define xexp nsimd_sleef_exp_u10d_avx_f64 #define xexpf nsimd_sleef_exp_u10d_avx_f32 #define xpow nsimd_sleef_pow_u10d_avx_f64 #define xpowf nsimd_sleef_pow_u10d_avx_f32 #define xsinh nsimd_sleef_sinh_u10d_avx_f64 #define xsinhf nsimd_sleef_sinh_u10d_avx_f32 #define xcosh nsimd_sleef_cosh_u10d_avx_f64 #define xcoshf nsimd_sleef_cosh_u10d_avx_f32 #define xtanh nsimd_sleef_tanh_u10d_avx_f64 #define xtanhf nsimd_sleef_tanh_u10d_avx_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_avx_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_avx_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_avx_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_avx_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_avx_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_avx_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_avx_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_avx_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_avx_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_avx_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_avx_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_avx_f32 #define xasinh nsimd_sleef_asinh_u10d_avx_f64 #define xasinhf nsimd_sleef_asinh_u10d_avx_f32 #define xacosh nsimd_sleef_acosh_u10d_avx_f64 #define xacoshf nsimd_sleef_acosh_u10d_avx_f32 #define xatanh nsimd_sleef_atanh_u10d_avx_f64 #define xatanhf nsimd_sleef_atanh_u10d_avx_f32 #define xexp2 nsimd_sleef_exp2_u10d_avx_f64 #define xexp2f nsimd_sleef_exp2_u10d_avx_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_avx_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_avx_f32 #define xexp10 nsimd_sleef_exp10_u10d_avx_f64 #define xexp10f nsimd_sleef_exp10_u10d_avx_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_avx_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_avx_f32 #define xexpm1 nsimd_sleef_expm1_u10d_avx_f64 #define xexpm1f 
nsimd_sleef_expm1_u10d_avx_f32 #define xlog10 nsimd_sleef_log10_u10d_avx_f64 #define xlog10f nsimd_sleef_log10_u10d_avx_f32 #define xlog2 nsimd_sleef_log2_u10d_avx_f64 #define xlog2f nsimd_sleef_log2_u10d_avx_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_avx_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_avx_f32 #define xlog1p nsimd_sleef_log1p_u10d_avx_f64 #define xlog1pf nsimd_sleef_log1p_u10d_avx_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_avx_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_avx_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_avx_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_avx_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_avx_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_avx_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_avx_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_avx_f32 #define xldexp nsimd_sleef_ldexp_avx_f64 #define xldexpf nsimd_sleef_ldexp_avx_f32 #define xilogb nsimd_sleef_ilogb_avx_f64 #define xilogbf nsimd_sleef_ilogb_avx_f32 #define xfma nsimd_sleef_fma_avx_f64 #define xfmaf nsimd_sleef_fma_avx_f32 #define xsqrt nsimd_sleef_sqrt_avx_f64 #define xsqrtf nsimd_sleef_sqrt_avx_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_avx_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_avx_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_avx_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_avx_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_avx_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_avx_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_avx_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_avx_f32 #define xfabs nsimd_sleef_fabs_avx_f64 #define xfabsf nsimd_sleef_fabs_avx_f32 #define xcopysign nsimd_sleef_copysign_avx_f64 #define xcopysignf nsimd_sleef_copysign_avx_f32 #define xfmax nsimd_sleef_fmax_avx_f64 #define xfmaxf nsimd_sleef_fmax_avx_f32 #define xfmin nsimd_sleef_fmin_avx_f64 #define xfminf nsimd_sleef_fmin_avx_f32 #define xfdim nsimd_sleef_fdim_avx_f64 #define xfdimf nsimd_sleef_fdim_avx_f32 #define xtrunc nsimd_sleef_trunc_avx_f64 
/* Slice of the SLEEF->nsimd rename table for AVX (src/renameavx.h).
 * Maps SLEEF-internal short names (xtruncf, xfloor, ...) onto nsimd's
 * exported symbol names of the form nsimd_sleef_<op>[_u<ulp>[d]]_avx_f{32,64}:
 * the "f"-suffixed short name is the f32 variant, the plain name is f64,
 * and the "d" in u10d/u15d marks the DETERMINISTIC build of that accuracy
 * level (u05/u10/u15/u35 encode the guaranteed ULP error bound).
 * This span finishes the `#ifdef DETERMINISTIC` branch (through xgetPtrf)
 * and, after `#else`, begins the default non-deterministic mappings.
 * NOTE(review): in the upstream header each #define sits on its own line;
 * the single-line layout seen here is an extraction artifact of this
 * concatenated dump — do not reflow without restoring one directive per
 * line for the whole file at once. */
#define xtruncf nsimd_sleef_trunc_avx_f32 #define xfloor nsimd_sleef_floor_avx_f64 #define xfloorf nsimd_sleef_floor_avx_f32 #define xceil nsimd_sleef_ceil_avx_f64 #define xceilf nsimd_sleef_ceil_avx_f32 #define xround nsimd_sleef_round_avx_f64 #define xroundf nsimd_sleef_round_avx_f32 #define xrint nsimd_sleef_rint_avx_f64 #define xrintf nsimd_sleef_rint_avx_f32 #define xnextafter nsimd_sleef_nextafter_avx_f64 #define xnextafterf nsimd_sleef_nextafter_avx_f32 #define xfrfrexp nsimd_sleef_frfrexp_avx_f64 #define xfrfrexpf nsimd_sleef_frfrexp_avx_f32 #define xexpfrexp nsimd_sleef_expfrexp_avx_f64 #define xexpfrexpf nsimd_sleef_expfrexp_avx_f32 #define xfmod nsimd_sleef_fmod_avx_f64 #define xfmodf nsimd_sleef_fmod_avx_f32 #define xremainder nsimd_sleef_remainder_avx_f64 #define xremainderf nsimd_sleef_remainder_avx_f32 #define xmodf nsimd_sleef_modf_avx_f64 #define xmodff nsimd_sleef_modf_avx_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_avx_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_avx_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_avx_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_avx_f32 #define xerf_u1 nsimd_sleef_erf_u10d_avx_f64 #define xerff_u1 nsimd_sleef_erf_u10d_avx_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_avx_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_avx_f32 #define xgetInt nsimd_sleef_getInt_avx_f64 #define xgetIntf nsimd_sleef_getInt_avx_f32 #define xgetPtr nsimd_sleef_getPtr_avx_f64 #define xgetPtrf nsimd_sleef_getPtr_avx_f32 #else #define xsin nsimd_sleef_sin_u35_avx_f64 #define xsinf nsimd_sleef_sin_u35_avx_f32 #define xcos nsimd_sleef_cos_u35_avx_f64 #define xcosf nsimd_sleef_cos_u35_avx_f32 #define xsincos nsimd_sleef_sincos_u35_avx_f64 #define xsincosf nsimd_sleef_sincos_u35_avx_f32 #define xtan nsimd_sleef_tan_u35_avx_f64 #define xtanf nsimd_sleef_tan_u35_avx_f32 #define xasin nsimd_sleef_asin_u35_avx_f64 #define xasinf nsimd_sleef_asin_u35_avx_f32 #define xacos nsimd_sleef_acos_u35_avx_f64 #define xacosf nsimd_sleef_acos_u35_avx_f32
#define xatan nsimd_sleef_atan_u35_avx_f64 #define xatanf nsimd_sleef_atan_u35_avx_f32 #define xatan2 nsimd_sleef_atan2_u35_avx_f64 #define xatan2f nsimd_sleef_atan2_u35_avx_f32 #define xlog nsimd_sleef_log_u35_avx_f64 #define xlogf nsimd_sleef_log_u35_avx_f32 #define xcbrt nsimd_sleef_cbrt_u35_avx_f64 #define xcbrtf nsimd_sleef_cbrt_u35_avx_f32 #define xsin_u1 nsimd_sleef_sin_u10_avx_f64 #define xsinf_u1 nsimd_sleef_sin_u10_avx_f32 #define xcos_u1 nsimd_sleef_cos_u10_avx_f64 #define xcosf_u1 nsimd_sleef_cos_u10_avx_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_avx_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_avx_f32 #define xtan_u1 nsimd_sleef_tan_u10_avx_f64 #define xtanf_u1 nsimd_sleef_tan_u10_avx_f32 #define xasin_u1 nsimd_sleef_asin_u10_avx_f64 #define xasinf_u1 nsimd_sleef_asin_u10_avx_f32 #define xacos_u1 nsimd_sleef_acos_u10_avx_f64 #define xacosf_u1 nsimd_sleef_acos_u10_avx_f32 #define xatan_u1 nsimd_sleef_atan_u10_avx_f64 #define xatanf_u1 nsimd_sleef_atan_u10_avx_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_avx_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_avx_f32 #define xlog_u1 nsimd_sleef_log_u10_avx_f64 #define xlogf_u1 nsimd_sleef_log_u10_avx_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_avx_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_avx_f32 #define xexp nsimd_sleef_exp_u10_avx_f64 #define xexpf nsimd_sleef_exp_u10_avx_f32 #define xpow nsimd_sleef_pow_u10_avx_f64 #define xpowf nsimd_sleef_pow_u10_avx_f32 #define xsinh nsimd_sleef_sinh_u10_avx_f64 #define xsinhf nsimd_sleef_sinh_u10_avx_f32 #define xcosh nsimd_sleef_cosh_u10_avx_f64 #define xcoshf nsimd_sleef_cosh_u10_avx_f32 #define xtanh nsimd_sleef_tanh_u10_avx_f64 #define xtanhf nsimd_sleef_tanh_u10_avx_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_avx_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_avx_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_avx_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_avx_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_avx_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_avx_f32 #define 
xfastsin_u3500 nsimd_sleef_fastsin_u3500_avx_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_avx_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_avx_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_avx_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_avx_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_avx_f32 #define xasinh nsimd_sleef_asinh_u10_avx_f64 #define xasinhf nsimd_sleef_asinh_u10_avx_f32 #define xacosh nsimd_sleef_acosh_u10_avx_f64 #define xacoshf nsimd_sleef_acosh_u10_avx_f32 #define xatanh nsimd_sleef_atanh_u10_avx_f64 #define xatanhf nsimd_sleef_atanh_u10_avx_f32 #define xexp2 nsimd_sleef_exp2_u10_avx_f64 #define xexp2f nsimd_sleef_exp2_u10_avx_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_avx_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_avx_f32 #define xexp10 nsimd_sleef_exp10_u10_avx_f64 #define xexp10f nsimd_sleef_exp10_u10_avx_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_avx_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_avx_f32 #define xexpm1 nsimd_sleef_expm1_u10_avx_f64 #define xexpm1f nsimd_sleef_expm1_u10_avx_f32 #define xlog10 nsimd_sleef_log10_u10_avx_f64 #define xlog10f nsimd_sleef_log10_u10_avx_f32 #define xlog2 nsimd_sleef_log2_u10_avx_f64 #define xlog2f nsimd_sleef_log2_u10_avx_f32 #define xlog2_u35 nsimd_sleef_log2_u35_avx_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_avx_f32 #define xlog1p nsimd_sleef_log1p_u10_avx_f64 #define xlog1pf nsimd_sleef_log1p_u10_avx_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_avx_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_avx_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_avx_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_avx_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_avx_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_avx_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_avx_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_avx_f32 #define xldexp nsimd_sleef_ldexp_avx_f64 #define xldexpf nsimd_sleef_ldexp_avx_f32 #define xilogb nsimd_sleef_ilogb_avx_f64 #define xilogbf 
nsimd_sleef_ilogb_avx_f32 #define xfma nsimd_sleef_fma_avx_f64 #define xfmaf nsimd_sleef_fma_avx_f32 #define xsqrt nsimd_sleef_sqrt_avx_f64 #define xsqrtf nsimd_sleef_sqrt_avx_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_avx_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_avx_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_avx_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_avx_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_avx_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_avx_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_avx_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_avx_f32 #define xfabs nsimd_sleef_fabs_avx_f64 #define xfabsf nsimd_sleef_fabs_avx_f32 #define xcopysign nsimd_sleef_copysign_avx_f64 #define xcopysignf nsimd_sleef_copysign_avx_f32 #define xfmax nsimd_sleef_fmax_avx_f64 #define xfmaxf nsimd_sleef_fmax_avx_f32 #define xfmin nsimd_sleef_fmin_avx_f64 #define xfminf nsimd_sleef_fmin_avx_f32 #define xfdim nsimd_sleef_fdim_avx_f64 #define xfdimf nsimd_sleef_fdim_avx_f32 #define xtrunc nsimd_sleef_trunc_avx_f64 #define xtruncf nsimd_sleef_trunc_avx_f32 #define xfloor nsimd_sleef_floor_avx_f64 #define xfloorf nsimd_sleef_floor_avx_f32 #define xceil nsimd_sleef_ceil_avx_f64 #define xceilf nsimd_sleef_ceil_avx_f32 #define xround nsimd_sleef_round_avx_f64 #define xroundf nsimd_sleef_round_avx_f32 #define xrint nsimd_sleef_rint_avx_f64 #define xrintf nsimd_sleef_rint_avx_f32 #define xnextafter nsimd_sleef_nextafter_avx_f64 #define xnextafterf nsimd_sleef_nextafter_avx_f32 #define xfrfrexp nsimd_sleef_frfrexp_avx_f64 #define xfrfrexpf nsimd_sleef_frfrexp_avx_f32 #define xexpfrexp nsimd_sleef_expfrexp_avx_f64 #define xexpfrexpf nsimd_sleef_expfrexp_avx_f32 #define xfmod nsimd_sleef_fmod_avx_f64 #define xfmodf nsimd_sleef_fmod_avx_f32 #define xremainder nsimd_sleef_remainder_avx_f64 #define xremainderf nsimd_sleef_remainder_avx_f32 #define xmodf nsimd_sleef_modf_avx_f64 #define xmodff nsimd_sleef_modf_avx_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_avx_f64 #define xlgammaf_u1 
nsimd_sleef_lgamma_u10_avx_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_avx_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_avx_f32 #define xerf_u1 nsimd_sleef_erf_u10_avx_f64 #define xerff_u1 nsimd_sleef_erf_u10_avx_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_avx_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_avx_f32 #define xgetInt nsimd_sleef_getInt_avx_f64 #define xgetIntf nsimd_sleef_getInt_avx_f32 #define xgetPtr nsimd_sleef_getPtr_avx_f64 #define xgetPtrf nsimd_sleef_getPtr_avx_f32 #endif #define rempi nsimd_sleef_rempi_avx #define rempif nsimd_sleef_rempif_avx #define rempisub nsimd_sleef_rempisub_avx #define rempisubf nsimd_sleef_rempisubf_avx #define gammak nsimd_gammak_avx #define gammafk nsimd_gammafk_avx #endif #endif ================================================ FILE: src/renameavx2.h ================================================ #ifndef RENAMEAVX2_H #define RENAMEAVX2_H /* ------------------------------------------------------------------------- */ /* Naming of functions avx2 */ #ifdef NSIMD_AVX2 #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_avx2_f64 #define xsinf nsimd_sleef_sin_u35d_avx2_f32 #define xcos nsimd_sleef_cos_u35d_avx2_f64 #define xcosf nsimd_sleef_cos_u35d_avx2_f32 #define xsincos nsimd_sleef_sincos_u35d_avx2_f64 #define xsincosf nsimd_sleef_sincos_u35d_avx2_f32 #define xtan nsimd_sleef_tan_u35d_avx2_f64 #define xtanf nsimd_sleef_tan_u35d_avx2_f32 #define xasin nsimd_sleef_asin_u35d_avx2_f64 #define xasinf nsimd_sleef_asin_u35d_avx2_f32 #define xacos nsimd_sleef_acos_u35d_avx2_f64 #define xacosf nsimd_sleef_acos_u35d_avx2_f32 #define xatan nsimd_sleef_atan_u35d_avx2_f64 #define xatanf nsimd_sleef_atan_u35d_avx2_f32 #define xatan2 nsimd_sleef_atan2_u35d_avx2_f64 #define xatan2f nsimd_sleef_atan2_u35d_avx2_f32 #define xlog nsimd_sleef_log_u35d_avx2_f64 #define xlogf nsimd_sleef_log_u35d_avx2_f32 #define xcbrt nsimd_sleef_cbrt_u35d_avx2_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_avx2_f32 #define xsin_u1 
nsimd_sleef_sin_u10d_avx2_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_avx2_f32 #define xcos_u1 nsimd_sleef_cos_u10d_avx2_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_avx2_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_avx2_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_avx2_f32 #define xtan_u1 nsimd_sleef_tan_u10d_avx2_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_avx2_f32 #define xasin_u1 nsimd_sleef_asin_u10d_avx2_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_avx2_f32 #define xacos_u1 nsimd_sleef_acos_u10d_avx2_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_avx2_f32 #define xatan_u1 nsimd_sleef_atan_u10d_avx2_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_avx2_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_avx2_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_avx2_f32 #define xlog_u1 nsimd_sleef_log_u10d_avx2_f64 #define xlogf_u1 nsimd_sleef_log_u10d_avx2_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_avx2_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_avx2_f32 #define xexp nsimd_sleef_exp_u10d_avx2_f64 #define xexpf nsimd_sleef_exp_u10d_avx2_f32 #define xpow nsimd_sleef_pow_u10d_avx2_f64 #define xpowf nsimd_sleef_pow_u10d_avx2_f32 #define xsinh nsimd_sleef_sinh_u10d_avx2_f64 #define xsinhf nsimd_sleef_sinh_u10d_avx2_f32 #define xcosh nsimd_sleef_cosh_u10d_avx2_f64 #define xcoshf nsimd_sleef_cosh_u10d_avx2_f32 #define xtanh nsimd_sleef_tanh_u10d_avx2_f64 #define xtanhf nsimd_sleef_tanh_u10d_avx2_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_avx2_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_avx2_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_avx2_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_avx2_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_avx2_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_avx2_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_avx2_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_avx2_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_avx2_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_avx2_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_avx2_f64 #define 
xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_avx2_f32 #define xasinh nsimd_sleef_asinh_u10d_avx2_f64 #define xasinhf nsimd_sleef_asinh_u10d_avx2_f32 #define xacosh nsimd_sleef_acosh_u10d_avx2_f64 #define xacoshf nsimd_sleef_acosh_u10d_avx2_f32 #define xatanh nsimd_sleef_atanh_u10d_avx2_f64 #define xatanhf nsimd_sleef_atanh_u10d_avx2_f32 #define xexp2 nsimd_sleef_exp2_u10d_avx2_f64 #define xexp2f nsimd_sleef_exp2_u10d_avx2_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_avx2_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_avx2_f32 #define xexp10 nsimd_sleef_exp10_u10d_avx2_f64 #define xexp10f nsimd_sleef_exp10_u10d_avx2_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_avx2_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_avx2_f32 #define xexpm1 nsimd_sleef_expm1_u10d_avx2_f64 #define xexpm1f nsimd_sleef_expm1_u10d_avx2_f32 #define xlog10 nsimd_sleef_log10_u10d_avx2_f64 #define xlog10f nsimd_sleef_log10_u10d_avx2_f32 #define xlog2 nsimd_sleef_log2_u10d_avx2_f64 #define xlog2f nsimd_sleef_log2_u10d_avx2_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_avx2_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_avx2_f32 #define xlog1p nsimd_sleef_log1p_u10d_avx2_f64 #define xlog1pf nsimd_sleef_log1p_u10d_avx2_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_avx2_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_avx2_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_avx2_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_avx2_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_avx2_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_avx2_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_avx2_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_avx2_f32 #define xldexp nsimd_sleef_ldexp_avx2_f64 #define xldexpf nsimd_sleef_ldexp_avx2_f32 #define xilogb nsimd_sleef_ilogb_avx2_f64 #define xilogbf nsimd_sleef_ilogb_avx2_f32 #define xfma nsimd_sleef_fma_avx2_f64 #define xfmaf nsimd_sleef_fma_avx2_f32 #define xsqrt nsimd_sleef_sqrt_avx2_f64 #define xsqrtf nsimd_sleef_sqrt_avx2_f32 #define xsqrt_u05 
nsimd_sleef_sqrt_u05d_avx2_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_avx2_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_avx2_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_avx2_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_avx2_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_avx2_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_avx2_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_avx2_f32 #define xfabs nsimd_sleef_fabs_avx2_f64 #define xfabsf nsimd_sleef_fabs_avx2_f32 #define xcopysign nsimd_sleef_copysign_avx2_f64 #define xcopysignf nsimd_sleef_copysign_avx2_f32 #define xfmax nsimd_sleef_fmax_avx2_f64 #define xfmaxf nsimd_sleef_fmax_avx2_f32 #define xfmin nsimd_sleef_fmin_avx2_f64 #define xfminf nsimd_sleef_fmin_avx2_f32 #define xfdim nsimd_sleef_fdim_avx2_f64 #define xfdimf nsimd_sleef_fdim_avx2_f32 #define xtrunc nsimd_sleef_trunc_avx2_f64 #define xtruncf nsimd_sleef_trunc_avx2_f32 #define xfloor nsimd_sleef_floor_avx2_f64 #define xfloorf nsimd_sleef_floor_avx2_f32 #define xceil nsimd_sleef_ceil_avx2_f64 #define xceilf nsimd_sleef_ceil_avx2_f32 #define xround nsimd_sleef_round_avx2_f64 #define xroundf nsimd_sleef_round_avx2_f32 #define xrint nsimd_sleef_rint_avx2_f64 #define xrintf nsimd_sleef_rint_avx2_f32 #define xnextafter nsimd_sleef_nextafter_avx2_f64 #define xnextafterf nsimd_sleef_nextafter_avx2_f32 #define xfrfrexp nsimd_sleef_frfrexp_avx2_f64 #define xfrfrexpf nsimd_sleef_frfrexp_avx2_f32 #define xexpfrexp nsimd_sleef_expfrexp_avx2_f64 #define xexpfrexpf nsimd_sleef_expfrexp_avx2_f32 #define xfmod nsimd_sleef_fmod_avx2_f64 #define xfmodf nsimd_sleef_fmod_avx2_f32 #define xremainder nsimd_sleef_remainder_avx2_f64 #define xremainderf nsimd_sleef_remainder_avx2_f32 #define xmodf nsimd_sleef_modf_avx2_f64 #define xmodff nsimd_sleef_modf_avx2_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_avx2_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_avx2_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_avx2_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_avx2_f32 #define 
xerf_u1 nsimd_sleef_erf_u10d_avx2_f64 #define xerff_u1 nsimd_sleef_erf_u10d_avx2_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_avx2_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_avx2_f32 #define xgetInt nsimd_sleef_getInt_avx2_f64 #define xgetIntf nsimd_sleef_getInt_avx2_f32 #define xgetPtr nsimd_sleef_getPtr_avx2_f64 #define xgetPtrf nsimd_sleef_getPtr_avx2_f32 #else #define xsin nsimd_sleef_sin_u35_avx2_f64 #define xsinf nsimd_sleef_sin_u35_avx2_f32 #define xcos nsimd_sleef_cos_u35_avx2_f64 #define xcosf nsimd_sleef_cos_u35_avx2_f32 #define xsincos nsimd_sleef_sincos_u35_avx2_f64 #define xsincosf nsimd_sleef_sincos_u35_avx2_f32 #define xtan nsimd_sleef_tan_u35_avx2_f64 #define xtanf nsimd_sleef_tan_u35_avx2_f32 #define xasin nsimd_sleef_asin_u35_avx2_f64 #define xasinf nsimd_sleef_asin_u35_avx2_f32 #define xacos nsimd_sleef_acos_u35_avx2_f64 #define xacosf nsimd_sleef_acos_u35_avx2_f32 #define xatan nsimd_sleef_atan_u35_avx2_f64 #define xatanf nsimd_sleef_atan_u35_avx2_f32 #define xatan2 nsimd_sleef_atan2_u35_avx2_f64 #define xatan2f nsimd_sleef_atan2_u35_avx2_f32 #define xlog nsimd_sleef_log_u35_avx2_f64 #define xlogf nsimd_sleef_log_u35_avx2_f32 #define xcbrt nsimd_sleef_cbrt_u35_avx2_f64 #define xcbrtf nsimd_sleef_cbrt_u35_avx2_f32 #define xsin_u1 nsimd_sleef_sin_u10_avx2_f64 #define xsinf_u1 nsimd_sleef_sin_u10_avx2_f32 #define xcos_u1 nsimd_sleef_cos_u10_avx2_f64 #define xcosf_u1 nsimd_sleef_cos_u10_avx2_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_avx2_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_avx2_f32 #define xtan_u1 nsimd_sleef_tan_u10_avx2_f64 #define xtanf_u1 nsimd_sleef_tan_u10_avx2_f32 #define xasin_u1 nsimd_sleef_asin_u10_avx2_f64 #define xasinf_u1 nsimd_sleef_asin_u10_avx2_f32 #define xacos_u1 nsimd_sleef_acos_u10_avx2_f64 #define xacosf_u1 nsimd_sleef_acos_u10_avx2_f32 #define xatan_u1 nsimd_sleef_atan_u10_avx2_f64 #define xatanf_u1 nsimd_sleef_atan_u10_avx2_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_avx2_f64 #define xatan2f_u1 
nsimd_sleef_atan2_u10_avx2_f32 #define xlog_u1 nsimd_sleef_log_u10_avx2_f64 #define xlogf_u1 nsimd_sleef_log_u10_avx2_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_avx2_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_avx2_f32 #define xexp nsimd_sleef_exp_u10_avx2_f64 #define xexpf nsimd_sleef_exp_u10_avx2_f32 #define xpow nsimd_sleef_pow_u10_avx2_f64 #define xpowf nsimd_sleef_pow_u10_avx2_f32 #define xsinh nsimd_sleef_sinh_u10_avx2_f64 #define xsinhf nsimd_sleef_sinh_u10_avx2_f32 #define xcosh nsimd_sleef_cosh_u10_avx2_f64 #define xcoshf nsimd_sleef_cosh_u10_avx2_f32 #define xtanh nsimd_sleef_tanh_u10_avx2_f64 #define xtanhf nsimd_sleef_tanh_u10_avx2_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_avx2_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_avx2_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_avx2_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_avx2_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_avx2_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_avx2_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_avx2_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_avx2_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_avx2_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_avx2_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_avx2_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_avx2_f32 #define xasinh nsimd_sleef_asinh_u10_avx2_f64 #define xasinhf nsimd_sleef_asinh_u10_avx2_f32 #define xacosh nsimd_sleef_acosh_u10_avx2_f64 #define xacoshf nsimd_sleef_acosh_u10_avx2_f32 #define xatanh nsimd_sleef_atanh_u10_avx2_f64 #define xatanhf nsimd_sleef_atanh_u10_avx2_f32 #define xexp2 nsimd_sleef_exp2_u10_avx2_f64 #define xexp2f nsimd_sleef_exp2_u10_avx2_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_avx2_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_avx2_f32 #define xexp10 nsimd_sleef_exp10_u10_avx2_f64 #define xexp10f nsimd_sleef_exp10_u10_avx2_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_avx2_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_avx2_f32 #define xexpm1 
nsimd_sleef_expm1_u10_avx2_f64 #define xexpm1f nsimd_sleef_expm1_u10_avx2_f32 #define xlog10 nsimd_sleef_log10_u10_avx2_f64 #define xlog10f nsimd_sleef_log10_u10_avx2_f32 #define xlog2 nsimd_sleef_log2_u10_avx2_f64 #define xlog2f nsimd_sleef_log2_u10_avx2_f32 #define xlog2_u35 nsimd_sleef_log2_u35_avx2_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_avx2_f32 #define xlog1p nsimd_sleef_log1p_u10_avx2_f64 #define xlog1pf nsimd_sleef_log1p_u10_avx2_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_avx2_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_avx2_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_avx2_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_avx2_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_avx2_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_avx2_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_avx2_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_avx2_f32 #define xldexp nsimd_sleef_ldexp_avx2_f64 #define xldexpf nsimd_sleef_ldexp_avx2_f32 #define xilogb nsimd_sleef_ilogb_avx2_f64 #define xilogbf nsimd_sleef_ilogb_avx2_f32 #define xfma nsimd_sleef_fma_avx2_f64 #define xfmaf nsimd_sleef_fma_avx2_f32 #define xsqrt nsimd_sleef_sqrt_avx2_f64 #define xsqrtf nsimd_sleef_sqrt_avx2_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_avx2_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_avx2_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_avx2_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_avx2_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_avx2_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_avx2_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_avx2_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_avx2_f32 #define xfabs nsimd_sleef_fabs_avx2_f64 #define xfabsf nsimd_sleef_fabs_avx2_f32 #define xcopysign nsimd_sleef_copysign_avx2_f64 #define xcopysignf nsimd_sleef_copysign_avx2_f32 #define xfmax nsimd_sleef_fmax_avx2_f64 #define xfmaxf nsimd_sleef_fmax_avx2_f32 #define xfmin nsimd_sleef_fmin_avx2_f64 #define xfminf nsimd_sleef_fmin_avx2_f32 #define xfdim nsimd_sleef_fdim_avx2_f64 #define xfdimf 
nsimd_sleef_fdim_avx2_f32 #define xtrunc nsimd_sleef_trunc_avx2_f64 #define xtruncf nsimd_sleef_trunc_avx2_f32 #define xfloor nsimd_sleef_floor_avx2_f64 #define xfloorf nsimd_sleef_floor_avx2_f32 #define xceil nsimd_sleef_ceil_avx2_f64 #define xceilf nsimd_sleef_ceil_avx2_f32 #define xround nsimd_sleef_round_avx2_f64 #define xroundf nsimd_sleef_round_avx2_f32 #define xrint nsimd_sleef_rint_avx2_f64 #define xrintf nsimd_sleef_rint_avx2_f32 #define xnextafter nsimd_sleef_nextafter_avx2_f64 #define xnextafterf nsimd_sleef_nextafter_avx2_f32 #define xfrfrexp nsimd_sleef_frfrexp_avx2_f64 #define xfrfrexpf nsimd_sleef_frfrexp_avx2_f32 #define xexpfrexp nsimd_sleef_expfrexp_avx2_f64 #define xexpfrexpf nsimd_sleef_expfrexp_avx2_f32 #define xfmod nsimd_sleef_fmod_avx2_f64 #define xfmodf nsimd_sleef_fmod_avx2_f32 #define xremainder nsimd_sleef_remainder_avx2_f64 #define xremainderf nsimd_sleef_remainder_avx2_f32 #define xmodf nsimd_sleef_modf_avx2_f64 #define xmodff nsimd_sleef_modf_avx2_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_avx2_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_avx2_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_avx2_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_avx2_f32 #define xerf_u1 nsimd_sleef_erf_u10_avx2_f64 #define xerff_u1 nsimd_sleef_erf_u10_avx2_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_avx2_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_avx2_f32 #define xgetInt nsimd_sleef_getInt_avx2_f64 #define xgetIntf nsimd_sleef_getInt_avx2_f32 #define xgetPtr nsimd_sleef_getPtr_avx2_f64 #define xgetPtrf nsimd_sleef_getPtr_avx2_f32 #endif #define rempi nsimd_sleef_rempi_avx2 #define rempif nsimd_sleef_rempif_avx2 #define rempisub nsimd_sleef_rempisub_avx2 #define rempisubf nsimd_sleef_rempisubf_avx2 #define gammak nsimd_gammak_avx2 #define gammafk nsimd_gammafk_avx2 #endif #endif ================================================ FILE: src/renameavx512f.h ================================================ #ifndef RENAMEAVX512F_H #define RENAMEAVX512F_H /* 
------------------------------------------------------------------------- */ /* Naming of functions avx512_knl */ #ifdef NSIMD_AVX512_KNL #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_avx512_knl_f64 #define xsinf nsimd_sleef_sin_u35d_avx512_knl_f32 #define xcos nsimd_sleef_cos_u35d_avx512_knl_f64 #define xcosf nsimd_sleef_cos_u35d_avx512_knl_f32 #define xsincos nsimd_sleef_sincos_u35d_avx512_knl_f64 #define xsincosf nsimd_sleef_sincos_u35d_avx512_knl_f32 #define xtan nsimd_sleef_tan_u35d_avx512_knl_f64 #define xtanf nsimd_sleef_tan_u35d_avx512_knl_f32 #define xasin nsimd_sleef_asin_u35d_avx512_knl_f64 #define xasinf nsimd_sleef_asin_u35d_avx512_knl_f32 #define xacos nsimd_sleef_acos_u35d_avx512_knl_f64 #define xacosf nsimd_sleef_acos_u35d_avx512_knl_f32 #define xatan nsimd_sleef_atan_u35d_avx512_knl_f64 #define xatanf nsimd_sleef_atan_u35d_avx512_knl_f32 #define xatan2 nsimd_sleef_atan2_u35d_avx512_knl_f64 #define xatan2f nsimd_sleef_atan2_u35d_avx512_knl_f32 #define xlog nsimd_sleef_log_u35d_avx512_knl_f64 #define xlogf nsimd_sleef_log_u35d_avx512_knl_f32 #define xcbrt nsimd_sleef_cbrt_u35d_avx512_knl_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_avx512_knl_f32 #define xsin_u1 nsimd_sleef_sin_u10d_avx512_knl_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_avx512_knl_f32 #define xcos_u1 nsimd_sleef_cos_u10d_avx512_knl_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_avx512_knl_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_avx512_knl_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_avx512_knl_f32 #define xtan_u1 nsimd_sleef_tan_u10d_avx512_knl_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_avx512_knl_f32 #define xasin_u1 nsimd_sleef_asin_u10d_avx512_knl_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_avx512_knl_f32 #define xacos_u1 nsimd_sleef_acos_u10d_avx512_knl_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_avx512_knl_f32 #define xatan_u1 nsimd_sleef_atan_u10d_avx512_knl_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_avx512_knl_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_avx512_knl_f64 
/* Auto-generated SLEEF->nsimd rename table for the AVX-512 KNL target
   (continuation of the `#ifdef DETERMINISTIC' branch opened above).
   Each macro maps a generic SLEEF entry point (`x...' / `x...f') onto the
   target-suffixed nsimd wrapper: `..._f64' for the double-precision form,
   `..._f32' for single precision.  In this branch the wrappers carry a `d'
   after the accuracy tag (e.g. `_u10d') selecting the deterministic SLEEF
   variants; functions with no accuracy tag (sqrt, fma, ldexp, fabs, ...)
   have a single shared implementation and no `d' form.  The `#else' branch
   that begins inside this span starts the identical table mapped onto the
   plain (non-deterministic) `_u35'/`_u10' wrappers.  Do not edit by hand:
   this header is generated (see egg/get_sleef_code.py). */
#define xatan2f_u1 nsimd_sleef_atan2_u10d_avx512_knl_f32 #define xlog_u1 nsimd_sleef_log_u10d_avx512_knl_f64 #define xlogf_u1 nsimd_sleef_log_u10d_avx512_knl_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_avx512_knl_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_avx512_knl_f32 #define xexp nsimd_sleef_exp_u10d_avx512_knl_f64 #define xexpf nsimd_sleef_exp_u10d_avx512_knl_f32 #define xpow nsimd_sleef_pow_u10d_avx512_knl_f64 #define xpowf nsimd_sleef_pow_u10d_avx512_knl_f32 #define xsinh nsimd_sleef_sinh_u10d_avx512_knl_f64 #define xsinhf nsimd_sleef_sinh_u10d_avx512_knl_f32 #define xcosh nsimd_sleef_cosh_u10d_avx512_knl_f64 #define xcoshf nsimd_sleef_cosh_u10d_avx512_knl_f32 #define xtanh nsimd_sleef_tanh_u10d_avx512_knl_f64 #define xtanhf nsimd_sleef_tanh_u10d_avx512_knl_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_avx512_knl_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_avx512_knl_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_avx512_knl_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_avx512_knl_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_avx512_knl_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_avx512_knl_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_avx512_knl_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_avx512_knl_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_avx512_knl_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_avx512_knl_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_avx512_knl_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_avx512_knl_f32 #define xasinh nsimd_sleef_asinh_u10d_avx512_knl_f64 #define xasinhf nsimd_sleef_asinh_u10d_avx512_knl_f32 #define xacosh nsimd_sleef_acosh_u10d_avx512_knl_f64 #define xacoshf nsimd_sleef_acosh_u10d_avx512_knl_f32 #define xatanh nsimd_sleef_atanh_u10d_avx512_knl_f64 #define xatanhf nsimd_sleef_atanh_u10d_avx512_knl_f32 #define xexp2 nsimd_sleef_exp2_u10d_avx512_knl_f64 #define xexp2f nsimd_sleef_exp2_u10d_avx512_knl_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_avx512_knl_f64 #define
xexp2f_u35 nsimd_sleef_exp2_u35d_avx512_knl_f32 #define xexp10 nsimd_sleef_exp10_u10d_avx512_knl_f64 #define xexp10f nsimd_sleef_exp10_u10d_avx512_knl_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_avx512_knl_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_avx512_knl_f32 #define xexpm1 nsimd_sleef_expm1_u10d_avx512_knl_f64 #define xexpm1f nsimd_sleef_expm1_u10d_avx512_knl_f32 #define xlog10 nsimd_sleef_log10_u10d_avx512_knl_f64 #define xlog10f nsimd_sleef_log10_u10d_avx512_knl_f32 #define xlog2 nsimd_sleef_log2_u10d_avx512_knl_f64 #define xlog2f nsimd_sleef_log2_u10d_avx512_knl_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_avx512_knl_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_avx512_knl_f32 #define xlog1p nsimd_sleef_log1p_u10d_avx512_knl_f64 #define xlog1pf nsimd_sleef_log1p_u10d_avx512_knl_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_avx512_knl_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_avx512_knl_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_avx512_knl_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_avx512_knl_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_avx512_knl_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_avx512_knl_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_avx512_knl_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_avx512_knl_f32 #define xldexp nsimd_sleef_ldexp_avx512_knl_f64 #define xldexpf nsimd_sleef_ldexp_avx512_knl_f32 #define xilogb nsimd_sleef_ilogb_avx512_knl_f64 #define xilogbf nsimd_sleef_ilogb_avx512_knl_f32 #define xfma nsimd_sleef_fma_avx512_knl_f64 #define xfmaf nsimd_sleef_fma_avx512_knl_f32 #define xsqrt nsimd_sleef_sqrt_avx512_knl_f64 #define xsqrtf nsimd_sleef_sqrt_avx512_knl_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_avx512_knl_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_avx512_knl_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_avx512_knl_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_avx512_knl_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_avx512_knl_f64 #define xhypotf_u05
nsimd_sleef_hypot_u05d_avx512_knl_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_avx512_knl_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_avx512_knl_f32 #define xfabs nsimd_sleef_fabs_avx512_knl_f64 #define xfabsf nsimd_sleef_fabs_avx512_knl_f32 #define xcopysign nsimd_sleef_copysign_avx512_knl_f64 #define xcopysignf nsimd_sleef_copysign_avx512_knl_f32 #define xfmax nsimd_sleef_fmax_avx512_knl_f64 #define xfmaxf nsimd_sleef_fmax_avx512_knl_f32 #define xfmin nsimd_sleef_fmin_avx512_knl_f64 #define xfminf nsimd_sleef_fmin_avx512_knl_f32 #define xfdim nsimd_sleef_fdim_avx512_knl_f64 #define xfdimf nsimd_sleef_fdim_avx512_knl_f32 #define xtrunc nsimd_sleef_trunc_avx512_knl_f64 #define xtruncf nsimd_sleef_trunc_avx512_knl_f32 #define xfloor nsimd_sleef_floor_avx512_knl_f64 #define xfloorf nsimd_sleef_floor_avx512_knl_f32 #define xceil nsimd_sleef_ceil_avx512_knl_f64 #define xceilf nsimd_sleef_ceil_avx512_knl_f32 #define xround nsimd_sleef_round_avx512_knl_f64 #define xroundf nsimd_sleef_round_avx512_knl_f32 #define xrint nsimd_sleef_rint_avx512_knl_f64 #define xrintf nsimd_sleef_rint_avx512_knl_f32 #define xnextafter nsimd_sleef_nextafter_avx512_knl_f64 #define xnextafterf nsimd_sleef_nextafter_avx512_knl_f32 #define xfrfrexp nsimd_sleef_frfrexp_avx512_knl_f64 #define xfrfrexpf nsimd_sleef_frfrexp_avx512_knl_f32 #define xexpfrexp nsimd_sleef_expfrexp_avx512_knl_f64 #define xexpfrexpf nsimd_sleef_expfrexp_avx512_knl_f32 #define xfmod nsimd_sleef_fmod_avx512_knl_f64 #define xfmodf nsimd_sleef_fmod_avx512_knl_f32 #define xremainder nsimd_sleef_remainder_avx512_knl_f64 #define xremainderf nsimd_sleef_remainder_avx512_knl_f32 #define xmodf nsimd_sleef_modf_avx512_knl_f64 #define xmodff nsimd_sleef_modf_avx512_knl_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_avx512_knl_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_avx512_knl_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_avx512_knl_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_avx512_knl_f32 #define xerf_u1
nsimd_sleef_erf_u10d_avx512_knl_f64 #define xerff_u1 nsimd_sleef_erf_u10d_avx512_knl_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_avx512_knl_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_avx512_knl_f32 #define xgetInt nsimd_sleef_getInt_avx512_knl_f64 #define xgetIntf nsimd_sleef_getInt_avx512_knl_f32 #define xgetPtr nsimd_sleef_getPtr_avx512_knl_f64 #define xgetPtrf nsimd_sleef_getPtr_avx512_knl_f32 #else #define xsin nsimd_sleef_sin_u35_avx512_knl_f64 #define xsinf nsimd_sleef_sin_u35_avx512_knl_f32 #define xcos nsimd_sleef_cos_u35_avx512_knl_f64 #define xcosf nsimd_sleef_cos_u35_avx512_knl_f32 #define xsincos nsimd_sleef_sincos_u35_avx512_knl_f64 #define xsincosf nsimd_sleef_sincos_u35_avx512_knl_f32 #define xtan nsimd_sleef_tan_u35_avx512_knl_f64 #define xtanf nsimd_sleef_tan_u35_avx512_knl_f32 #define xasin nsimd_sleef_asin_u35_avx512_knl_f64 #define xasinf nsimd_sleef_asin_u35_avx512_knl_f32 #define xacos nsimd_sleef_acos_u35_avx512_knl_f64 #define xacosf nsimd_sleef_acos_u35_avx512_knl_f32 #define xatan nsimd_sleef_atan_u35_avx512_knl_f64 #define xatanf nsimd_sleef_atan_u35_avx512_knl_f32 #define xatan2 nsimd_sleef_atan2_u35_avx512_knl_f64 #define xatan2f nsimd_sleef_atan2_u35_avx512_knl_f32 #define xlog nsimd_sleef_log_u35_avx512_knl_f64 #define xlogf nsimd_sleef_log_u35_avx512_knl_f32 #define xcbrt nsimd_sleef_cbrt_u35_avx512_knl_f64 #define xcbrtf nsimd_sleef_cbrt_u35_avx512_knl_f32 #define xsin_u1 nsimd_sleef_sin_u10_avx512_knl_f64 #define xsinf_u1 nsimd_sleef_sin_u10_avx512_knl_f32 #define xcos_u1 nsimd_sleef_cos_u10_avx512_knl_f64 #define xcosf_u1 nsimd_sleef_cos_u10_avx512_knl_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_avx512_knl_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_avx512_knl_f32 #define xtan_u1 nsimd_sleef_tan_u10_avx512_knl_f64 #define xtanf_u1 nsimd_sleef_tan_u10_avx512_knl_f32 #define xasin_u1 nsimd_sleef_asin_u10_avx512_knl_f64 #define xasinf_u1 nsimd_sleef_asin_u10_avx512_knl_f32 #define xacos_u1 nsimd_sleef_acos_u10_avx512_knl_f64
#define xacosf_u1 nsimd_sleef_acos_u10_avx512_knl_f32 #define xatan_u1 nsimd_sleef_atan_u10_avx512_knl_f64 #define xatanf_u1 nsimd_sleef_atan_u10_avx512_knl_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_avx512_knl_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_avx512_knl_f32 #define xlog_u1 nsimd_sleef_log_u10_avx512_knl_f64 #define xlogf_u1 nsimd_sleef_log_u10_avx512_knl_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_avx512_knl_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_avx512_knl_f32 #define xexp nsimd_sleef_exp_u10_avx512_knl_f64 #define xexpf nsimd_sleef_exp_u10_avx512_knl_f32 #define xpow nsimd_sleef_pow_u10_avx512_knl_f64 #define xpowf nsimd_sleef_pow_u10_avx512_knl_f32 #define xsinh nsimd_sleef_sinh_u10_avx512_knl_f64 #define xsinhf nsimd_sleef_sinh_u10_avx512_knl_f32 #define xcosh nsimd_sleef_cosh_u10_avx512_knl_f64 #define xcoshf nsimd_sleef_cosh_u10_avx512_knl_f32 #define xtanh nsimd_sleef_tanh_u10_avx512_knl_f64 #define xtanhf nsimd_sleef_tanh_u10_avx512_knl_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_avx512_knl_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_avx512_knl_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_avx512_knl_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_avx512_knl_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_avx512_knl_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_avx512_knl_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_avx512_knl_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_avx512_knl_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_avx512_knl_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_avx512_knl_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_avx512_knl_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_avx512_knl_f32 #define xasinh nsimd_sleef_asinh_u10_avx512_knl_f64 #define xasinhf nsimd_sleef_asinh_u10_avx512_knl_f32 #define xacosh nsimd_sleef_acosh_u10_avx512_knl_f64 #define xacoshf nsimd_sleef_acosh_u10_avx512_knl_f32 #define xatanh nsimd_sleef_atanh_u10_avx512_knl_f64 #define xatanhf 
nsimd_sleef_atanh_u10_avx512_knl_f32 #define xexp2 nsimd_sleef_exp2_u10_avx512_knl_f64 #define xexp2f nsimd_sleef_exp2_u10_avx512_knl_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_avx512_knl_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_avx512_knl_f32 #define xexp10 nsimd_sleef_exp10_u10_avx512_knl_f64 #define xexp10f nsimd_sleef_exp10_u10_avx512_knl_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_avx512_knl_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_avx512_knl_f32 #define xexpm1 nsimd_sleef_expm1_u10_avx512_knl_f64 #define xexpm1f nsimd_sleef_expm1_u10_avx512_knl_f32 #define xlog10 nsimd_sleef_log10_u10_avx512_knl_f64 #define xlog10f nsimd_sleef_log10_u10_avx512_knl_f32 #define xlog2 nsimd_sleef_log2_u10_avx512_knl_f64 #define xlog2f nsimd_sleef_log2_u10_avx512_knl_f32 #define xlog2_u35 nsimd_sleef_log2_u35_avx512_knl_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_avx512_knl_f32 #define xlog1p nsimd_sleef_log1p_u10_avx512_knl_f64 #define xlog1pf nsimd_sleef_log1p_u10_avx512_knl_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_avx512_knl_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_avx512_knl_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_avx512_knl_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_avx512_knl_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_avx512_knl_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_avx512_knl_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_avx512_knl_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_avx512_knl_f32 #define xldexp nsimd_sleef_ldexp_avx512_knl_f64 #define xldexpf nsimd_sleef_ldexp_avx512_knl_f32 #define xilogb nsimd_sleef_ilogb_avx512_knl_f64 #define xilogbf nsimd_sleef_ilogb_avx512_knl_f32 #define xfma nsimd_sleef_fma_avx512_knl_f64 #define xfmaf nsimd_sleef_fma_avx512_knl_f32 #define xsqrt nsimd_sleef_sqrt_avx512_knl_f64 #define xsqrtf nsimd_sleef_sqrt_avx512_knl_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_avx512_knl_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_avx512_knl_f32 #define xsqrt_u35 
nsimd_sleef_sqrt_u35_avx512_knl_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_avx512_knl_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_avx512_knl_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_avx512_knl_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_avx512_knl_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_avx512_knl_f32 #define xfabs nsimd_sleef_fabs_avx512_knl_f64 #define xfabsf nsimd_sleef_fabs_avx512_knl_f32 #define xcopysign nsimd_sleef_copysign_avx512_knl_f64 #define xcopysignf nsimd_sleef_copysign_avx512_knl_f32 #define xfmax nsimd_sleef_fmax_avx512_knl_f64 #define xfmaxf nsimd_sleef_fmax_avx512_knl_f32 #define xfmin nsimd_sleef_fmin_avx512_knl_f64 #define xfminf nsimd_sleef_fmin_avx512_knl_f32 #define xfdim nsimd_sleef_fdim_avx512_knl_f64 #define xfdimf nsimd_sleef_fdim_avx512_knl_f32 #define xtrunc nsimd_sleef_trunc_avx512_knl_f64 #define xtruncf nsimd_sleef_trunc_avx512_knl_f32 #define xfloor nsimd_sleef_floor_avx512_knl_f64 #define xfloorf nsimd_sleef_floor_avx512_knl_f32 #define xceil nsimd_sleef_ceil_avx512_knl_f64 #define xceilf nsimd_sleef_ceil_avx512_knl_f32 #define xround nsimd_sleef_round_avx512_knl_f64 #define xroundf nsimd_sleef_round_avx512_knl_f32 #define xrint nsimd_sleef_rint_avx512_knl_f64 #define xrintf nsimd_sleef_rint_avx512_knl_f32 #define xnextafter nsimd_sleef_nextafter_avx512_knl_f64 #define xnextafterf nsimd_sleef_nextafter_avx512_knl_f32 #define xfrfrexp nsimd_sleef_frfrexp_avx512_knl_f64 #define xfrfrexpf nsimd_sleef_frfrexp_avx512_knl_f32 #define xexpfrexp nsimd_sleef_expfrexp_avx512_knl_f64 #define xexpfrexpf nsimd_sleef_expfrexp_avx512_knl_f32 #define xfmod nsimd_sleef_fmod_avx512_knl_f64 #define xfmodf nsimd_sleef_fmod_avx512_knl_f32 #define xremainder nsimd_sleef_remainder_avx512_knl_f64 #define xremainderf nsimd_sleef_remainder_avx512_knl_f32 #define xmodf nsimd_sleef_modf_avx512_knl_f64 #define xmodff nsimd_sleef_modf_avx512_knl_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_avx512_knl_f64 #define xlgammaf_u1 
nsimd_sleef_lgamma_u10_avx512_knl_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_avx512_knl_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_avx512_knl_f32 #define xerf_u1 nsimd_sleef_erf_u10_avx512_knl_f64 #define xerff_u1 nsimd_sleef_erf_u10_avx512_knl_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_avx512_knl_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_avx512_knl_f32 #define xgetInt nsimd_sleef_getInt_avx512_knl_f64 #define xgetIntf nsimd_sleef_getInt_avx512_knl_f32 #define xgetPtr nsimd_sleef_getPtr_avx512_knl_f64 #define xgetPtrf nsimd_sleef_getPtr_avx512_knl_f32 #endif #define rempi nsimd_sleef_rempi_avx512_knl #define rempif nsimd_sleef_rempif_avx512_knl #define rempisub nsimd_sleef_rempisub_avx512_knl #define rempisubf nsimd_sleef_rempisubf_avx512_knl #define gammak nsimd_gammak_avx512_knl #define gammafk nsimd_gammafk_avx512_knl #endif /* ------------------------------------------------------------------------- */ /* Naming of functions avx512_skylake */ #ifdef NSIMD_AVX512_SKYLAKE #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_avx512_skylake_f64 #define xsinf nsimd_sleef_sin_u35d_avx512_skylake_f32 #define xcos nsimd_sleef_cos_u35d_avx512_skylake_f64 #define xcosf nsimd_sleef_cos_u35d_avx512_skylake_f32 #define xsincos nsimd_sleef_sincos_u35d_avx512_skylake_f64 #define xsincosf nsimd_sleef_sincos_u35d_avx512_skylake_f32 #define xtan nsimd_sleef_tan_u35d_avx512_skylake_f64 #define xtanf nsimd_sleef_tan_u35d_avx512_skylake_f32 #define xasin nsimd_sleef_asin_u35d_avx512_skylake_f64 #define xasinf nsimd_sleef_asin_u35d_avx512_skylake_f32 #define xacos nsimd_sleef_acos_u35d_avx512_skylake_f64 #define xacosf nsimd_sleef_acos_u35d_avx512_skylake_f32 #define xatan nsimd_sleef_atan_u35d_avx512_skylake_f64 #define xatanf nsimd_sleef_atan_u35d_avx512_skylake_f32 #define xatan2 nsimd_sleef_atan2_u35d_avx512_skylake_f64 #define xatan2f nsimd_sleef_atan2_u35d_avx512_skylake_f32 #define xlog nsimd_sleef_log_u35d_avx512_skylake_f64 #define xlogf 
nsimd_sleef_log_u35d_avx512_skylake_f32 #define xcbrt nsimd_sleef_cbrt_u35d_avx512_skylake_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_avx512_skylake_f32 #define xsin_u1 nsimd_sleef_sin_u10d_avx512_skylake_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_avx512_skylake_f32 #define xcos_u1 nsimd_sleef_cos_u10d_avx512_skylake_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_avx512_skylake_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_avx512_skylake_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_avx512_skylake_f32 #define xtan_u1 nsimd_sleef_tan_u10d_avx512_skylake_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_avx512_skylake_f32 #define xasin_u1 nsimd_sleef_asin_u10d_avx512_skylake_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_avx512_skylake_f32 #define xacos_u1 nsimd_sleef_acos_u10d_avx512_skylake_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_avx512_skylake_f32 #define xatan_u1 nsimd_sleef_atan_u10d_avx512_skylake_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_avx512_skylake_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_avx512_skylake_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_avx512_skylake_f32 #define xlog_u1 nsimd_sleef_log_u10d_avx512_skylake_f64 #define xlogf_u1 nsimd_sleef_log_u10d_avx512_skylake_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_avx512_skylake_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_avx512_skylake_f32 #define xexp nsimd_sleef_exp_u10d_avx512_skylake_f64 #define xexpf nsimd_sleef_exp_u10d_avx512_skylake_f32 #define xpow nsimd_sleef_pow_u10d_avx512_skylake_f64 #define xpowf nsimd_sleef_pow_u10d_avx512_skylake_f32 #define xsinh nsimd_sleef_sinh_u10d_avx512_skylake_f64 #define xsinhf nsimd_sleef_sinh_u10d_avx512_skylake_f32 #define xcosh nsimd_sleef_cosh_u10d_avx512_skylake_f64 #define xcoshf nsimd_sleef_cosh_u10d_avx512_skylake_f32 #define xtanh nsimd_sleef_tanh_u10d_avx512_skylake_f64 #define xtanhf nsimd_sleef_tanh_u10d_avx512_skylake_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_avx512_skylake_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_avx512_skylake_f32 #define 
xcosh_u35 nsimd_sleef_cosh_u35d_avx512_skylake_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_avx512_skylake_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_avx512_skylake_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_avx512_skylake_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_avx512_skylake_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_avx512_skylake_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_avx512_skylake_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_avx512_skylake_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_avx512_skylake_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_avx512_skylake_f32 #define xasinh nsimd_sleef_asinh_u10d_avx512_skylake_f64 #define xasinhf nsimd_sleef_asinh_u10d_avx512_skylake_f32 #define xacosh nsimd_sleef_acosh_u10d_avx512_skylake_f64 #define xacoshf nsimd_sleef_acosh_u10d_avx512_skylake_f32 #define xatanh nsimd_sleef_atanh_u10d_avx512_skylake_f64 #define xatanhf nsimd_sleef_atanh_u10d_avx512_skylake_f32 #define xexp2 nsimd_sleef_exp2_u10d_avx512_skylake_f64 #define xexp2f nsimd_sleef_exp2_u10d_avx512_skylake_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_avx512_skylake_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_avx512_skylake_f32 #define xexp10 nsimd_sleef_exp10_u10d_avx512_skylake_f64 #define xexp10f nsimd_sleef_exp10_u10d_avx512_skylake_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_avx512_skylake_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_avx512_skylake_f32 #define xexpm1 nsimd_sleef_expm1_u10d_avx512_skylake_f64 #define xexpm1f nsimd_sleef_expm1_u10d_avx512_skylake_f32 #define xlog10 nsimd_sleef_log10_u10d_avx512_skylake_f64 #define xlog10f nsimd_sleef_log10_u10d_avx512_skylake_f32 #define xlog2 nsimd_sleef_log2_u10d_avx512_skylake_f64 #define xlog2f nsimd_sleef_log2_u10d_avx512_skylake_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_avx512_skylake_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_avx512_skylake_f32 #define xlog1p nsimd_sleef_log1p_u10d_avx512_skylake_f64 #define xlog1pf 
nsimd_sleef_log1p_u10d_avx512_skylake_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_avx512_skylake_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_avx512_skylake_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_avx512_skylake_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_avx512_skylake_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_avx512_skylake_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_avx512_skylake_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_avx512_skylake_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_avx512_skylake_f32 #define xldexp nsimd_sleef_ldexp_avx512_skylake_f64 #define xldexpf nsimd_sleef_ldexp_avx512_skylake_f32 #define xilogb nsimd_sleef_ilogb_avx512_skylake_f64 #define xilogbf nsimd_sleef_ilogb_avx512_skylake_f32 #define xfma nsimd_sleef_fma_avx512_skylake_f64 #define xfmaf nsimd_sleef_fma_avx512_skylake_f32 #define xsqrt nsimd_sleef_sqrt_avx512_skylake_f64 #define xsqrtf nsimd_sleef_sqrt_avx512_skylake_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_avx512_skylake_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_avx512_skylake_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_avx512_skylake_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_avx512_skylake_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_avx512_skylake_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_avx512_skylake_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_avx512_skylake_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_avx512_skylake_f32 #define xfabs nsimd_sleef_fabs_avx512_skylake_f64 #define xfabsf nsimd_sleef_fabs_avx512_skylake_f32 #define xcopysign nsimd_sleef_copysign_avx512_skylake_f64 #define xcopysignf nsimd_sleef_copysign_avx512_skylake_f32 #define xfmax nsimd_sleef_fmax_avx512_skylake_f64 #define xfmaxf nsimd_sleef_fmax_avx512_skylake_f32 #define xfmin nsimd_sleef_fmin_avx512_skylake_f64 #define xfminf nsimd_sleef_fmin_avx512_skylake_f32 #define xfdim nsimd_sleef_fdim_avx512_skylake_f64 #define xfdimf nsimd_sleef_fdim_avx512_skylake_f32 #define xtrunc 
nsimd_sleef_trunc_avx512_skylake_f64 #define xtruncf nsimd_sleef_trunc_avx512_skylake_f32 #define xfloor nsimd_sleef_floor_avx512_skylake_f64 #define xfloorf nsimd_sleef_floor_avx512_skylake_f32 #define xceil nsimd_sleef_ceil_avx512_skylake_f64 #define xceilf nsimd_sleef_ceil_avx512_skylake_f32 #define xround nsimd_sleef_round_avx512_skylake_f64 #define xroundf nsimd_sleef_round_avx512_skylake_f32 #define xrint nsimd_sleef_rint_avx512_skylake_f64 #define xrintf nsimd_sleef_rint_avx512_skylake_f32 #define xnextafter nsimd_sleef_nextafter_avx512_skylake_f64 #define xnextafterf nsimd_sleef_nextafter_avx512_skylake_f32 #define xfrfrexp nsimd_sleef_frfrexp_avx512_skylake_f64 #define xfrfrexpf nsimd_sleef_frfrexp_avx512_skylake_f32 #define xexpfrexp nsimd_sleef_expfrexp_avx512_skylake_f64 #define xexpfrexpf nsimd_sleef_expfrexp_avx512_skylake_f32 #define xfmod nsimd_sleef_fmod_avx512_skylake_f64 #define xfmodf nsimd_sleef_fmod_avx512_skylake_f32 #define xremainder nsimd_sleef_remainder_avx512_skylake_f64 #define xremainderf nsimd_sleef_remainder_avx512_skylake_f32 #define xmodf nsimd_sleef_modf_avx512_skylake_f64 #define xmodff nsimd_sleef_modf_avx512_skylake_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_avx512_skylake_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_avx512_skylake_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_avx512_skylake_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_avx512_skylake_f32 #define xerf_u1 nsimd_sleef_erf_u10d_avx512_skylake_f64 #define xerff_u1 nsimd_sleef_erf_u10d_avx512_skylake_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_avx512_skylake_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_avx512_skylake_f32 #define xgetInt nsimd_sleef_getInt_avx512_skylake_f64 #define xgetIntf nsimd_sleef_getInt_avx512_skylake_f32 #define xgetPtr nsimd_sleef_getPtr_avx512_skylake_f64 #define xgetPtrf nsimd_sleef_getPtr_avx512_skylake_f32 #else #define xsin nsimd_sleef_sin_u35_avx512_skylake_f64 #define xsinf nsimd_sleef_sin_u35_avx512_skylake_f32 #define xcos 
nsimd_sleef_cos_u35_avx512_skylake_f64 #define xcosf nsimd_sleef_cos_u35_avx512_skylake_f32 #define xsincos nsimd_sleef_sincos_u35_avx512_skylake_f64 #define xsincosf nsimd_sleef_sincos_u35_avx512_skylake_f32 #define xtan nsimd_sleef_tan_u35_avx512_skylake_f64 #define xtanf nsimd_sleef_tan_u35_avx512_skylake_f32 #define xasin nsimd_sleef_asin_u35_avx512_skylake_f64 #define xasinf nsimd_sleef_asin_u35_avx512_skylake_f32 #define xacos nsimd_sleef_acos_u35_avx512_skylake_f64 #define xacosf nsimd_sleef_acos_u35_avx512_skylake_f32 #define xatan nsimd_sleef_atan_u35_avx512_skylake_f64 #define xatanf nsimd_sleef_atan_u35_avx512_skylake_f32 #define xatan2 nsimd_sleef_atan2_u35_avx512_skylake_f64 #define xatan2f nsimd_sleef_atan2_u35_avx512_skylake_f32 #define xlog nsimd_sleef_log_u35_avx512_skylake_f64 #define xlogf nsimd_sleef_log_u35_avx512_skylake_f32 #define xcbrt nsimd_sleef_cbrt_u35_avx512_skylake_f64 #define xcbrtf nsimd_sleef_cbrt_u35_avx512_skylake_f32 #define xsin_u1 nsimd_sleef_sin_u10_avx512_skylake_f64 #define xsinf_u1 nsimd_sleef_sin_u10_avx512_skylake_f32 #define xcos_u1 nsimd_sleef_cos_u10_avx512_skylake_f64 #define xcosf_u1 nsimd_sleef_cos_u10_avx512_skylake_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_avx512_skylake_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_avx512_skylake_f32 #define xtan_u1 nsimd_sleef_tan_u10_avx512_skylake_f64 #define xtanf_u1 nsimd_sleef_tan_u10_avx512_skylake_f32 #define xasin_u1 nsimd_sleef_asin_u10_avx512_skylake_f64 #define xasinf_u1 nsimd_sleef_asin_u10_avx512_skylake_f32 #define xacos_u1 nsimd_sleef_acos_u10_avx512_skylake_f64 #define xacosf_u1 nsimd_sleef_acos_u10_avx512_skylake_f32 #define xatan_u1 nsimd_sleef_atan_u10_avx512_skylake_f64 #define xatanf_u1 nsimd_sleef_atan_u10_avx512_skylake_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_avx512_skylake_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_avx512_skylake_f32 #define xlog_u1 nsimd_sleef_log_u10_avx512_skylake_f64 #define xlogf_u1 nsimd_sleef_log_u10_avx512_skylake_f32 
/* SLEEF -> nsimd renaming for AVX-512 (Skylake), non-DETERMINISTIC branch.
   Generated mapping: each SLEEF-internal name (x<op>[f][_u<ulp>]) is renamed
   to the corresponding nsimd_sleef_<op>_<ulp>_avx512_skylake_{f64,f32} symbol.
   Restored to one directive per physical line: a #define's replacement list
   extends to end-of-line, so several directives fused on one line are
   ill-formed / swallow each other. Content is token-identical to the
   original mapping. */
#define xcbrt_u1 nsimd_sleef_cbrt_u10_avx512_skylake_f64
#define xcbrtf_u1 nsimd_sleef_cbrt_u10_avx512_skylake_f32
#define xexp nsimd_sleef_exp_u10_avx512_skylake_f64
#define xexpf nsimd_sleef_exp_u10_avx512_skylake_f32
#define xpow nsimd_sleef_pow_u10_avx512_skylake_f64
#define xpowf nsimd_sleef_pow_u10_avx512_skylake_f32
#define xsinh nsimd_sleef_sinh_u10_avx512_skylake_f64
#define xsinhf nsimd_sleef_sinh_u10_avx512_skylake_f32
#define xcosh nsimd_sleef_cosh_u10_avx512_skylake_f64
#define xcoshf nsimd_sleef_cosh_u10_avx512_skylake_f32
#define xtanh nsimd_sleef_tanh_u10_avx512_skylake_f64
#define xtanhf nsimd_sleef_tanh_u10_avx512_skylake_f32
#define xsinh_u35 nsimd_sleef_sinh_u35_avx512_skylake_f64
#define xsinhf_u35 nsimd_sleef_sinh_u35_avx512_skylake_f32
#define xcosh_u35 nsimd_sleef_cosh_u35_avx512_skylake_f64
#define xcoshf_u35 nsimd_sleef_cosh_u35_avx512_skylake_f32
#define xtanh_u35 nsimd_sleef_tanh_u35_avx512_skylake_f64
#define xtanhf_u35 nsimd_sleef_tanh_u35_avx512_skylake_f32
#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_avx512_skylake_f64
#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_avx512_skylake_f32
#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_avx512_skylake_f64
#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_avx512_skylake_f32
#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_avx512_skylake_f64
#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_avx512_skylake_f32
#define xasinh nsimd_sleef_asinh_u10_avx512_skylake_f64
#define xasinhf nsimd_sleef_asinh_u10_avx512_skylake_f32
#define xacosh nsimd_sleef_acosh_u10_avx512_skylake_f64
#define xacoshf nsimd_sleef_acosh_u10_avx512_skylake_f32
#define xatanh nsimd_sleef_atanh_u10_avx512_skylake_f64
#define xatanhf nsimd_sleef_atanh_u10_avx512_skylake_f32
#define xexp2 nsimd_sleef_exp2_u10_avx512_skylake_f64
#define xexp2f nsimd_sleef_exp2_u10_avx512_skylake_f32
#define xexp2_u35 nsimd_sleef_exp2_u35_avx512_skylake_f64
#define xexp2f_u35 nsimd_sleef_exp2_u35_avx512_skylake_f32
#define xexp10 nsimd_sleef_exp10_u10_avx512_skylake_f64
#define xexp10f nsimd_sleef_exp10_u10_avx512_skylake_f32
#define xexp10_u35 nsimd_sleef_exp10_u35_avx512_skylake_f64
#define xexp10f_u35 nsimd_sleef_exp10_u35_avx512_skylake_f32
#define xexpm1 nsimd_sleef_expm1_u10_avx512_skylake_f64
#define xexpm1f nsimd_sleef_expm1_u10_avx512_skylake_f32
#define xlog10 nsimd_sleef_log10_u10_avx512_skylake_f64
#define xlog10f nsimd_sleef_log10_u10_avx512_skylake_f32
#define xlog2 nsimd_sleef_log2_u10_avx512_skylake_f64
#define xlog2f nsimd_sleef_log2_u10_avx512_skylake_f32
#define xlog2_u35 nsimd_sleef_log2_u35_avx512_skylake_f64
#define xlog2f_u35 nsimd_sleef_log2_u35_avx512_skylake_f32
#define xlog1p nsimd_sleef_log1p_u10_avx512_skylake_f64
#define xlog1pf nsimd_sleef_log1p_u10_avx512_skylake_f32
#define xsincospi_u05 nsimd_sleef_sincospi_u05_avx512_skylake_f64
#define xsincospif_u05 nsimd_sleef_sincospi_u05_avx512_skylake_f32
#define xsincospi_u35 nsimd_sleef_sincospi_u35_avx512_skylake_f64
#define xsincospif_u35 nsimd_sleef_sincospi_u35_avx512_skylake_f32
#define xsinpi_u05 nsimd_sleef_sinpi_u05_avx512_skylake_f64
#define xsinpif_u05 nsimd_sleef_sinpi_u05_avx512_skylake_f32
#define xcospi_u05 nsimd_sleef_cospi_u05_avx512_skylake_f64
#define xcospif_u05 nsimd_sleef_cospi_u05_avx512_skylake_f32
/* exact-result operations below carry no _u<ulp> accuracy suffix */
#define xldexp nsimd_sleef_ldexp_avx512_skylake_f64
#define xldexpf nsimd_sleef_ldexp_avx512_skylake_f32
#define xilogb nsimd_sleef_ilogb_avx512_skylake_f64
#define xilogbf nsimd_sleef_ilogb_avx512_skylake_f32
#define xfma nsimd_sleef_fma_avx512_skylake_f64
#define xfmaf nsimd_sleef_fma_avx512_skylake_f32
#define xsqrt nsimd_sleef_sqrt_avx512_skylake_f64
#define xsqrtf nsimd_sleef_sqrt_avx512_skylake_f32
#define xsqrt_u05 nsimd_sleef_sqrt_u05_avx512_skylake_f64
#define xsqrtf_u05 nsimd_sleef_sqrt_u05_avx512_skylake_f32
#define xsqrt_u35 nsimd_sleef_sqrt_u35_avx512_skylake_f64
#define xsqrtf_u35 nsimd_sleef_sqrt_u35_avx512_skylake_f32
#define xhypot_u05 nsimd_sleef_hypot_u05_avx512_skylake_f64
#define xhypotf_u05 nsimd_sleef_hypot_u05_avx512_skylake_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_avx512_skylake_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_avx512_skylake_f32 #define xfabs nsimd_sleef_fabs_avx512_skylake_f64 #define xfabsf nsimd_sleef_fabs_avx512_skylake_f32 #define xcopysign nsimd_sleef_copysign_avx512_skylake_f64 #define xcopysignf nsimd_sleef_copysign_avx512_skylake_f32 #define xfmax nsimd_sleef_fmax_avx512_skylake_f64 #define xfmaxf nsimd_sleef_fmax_avx512_skylake_f32 #define xfmin nsimd_sleef_fmin_avx512_skylake_f64 #define xfminf nsimd_sleef_fmin_avx512_skylake_f32 #define xfdim nsimd_sleef_fdim_avx512_skylake_f64 #define xfdimf nsimd_sleef_fdim_avx512_skylake_f32 #define xtrunc nsimd_sleef_trunc_avx512_skylake_f64 #define xtruncf nsimd_sleef_trunc_avx512_skylake_f32 #define xfloor nsimd_sleef_floor_avx512_skylake_f64 #define xfloorf nsimd_sleef_floor_avx512_skylake_f32 #define xceil nsimd_sleef_ceil_avx512_skylake_f64 #define xceilf nsimd_sleef_ceil_avx512_skylake_f32 #define xround nsimd_sleef_round_avx512_skylake_f64 #define xroundf nsimd_sleef_round_avx512_skylake_f32 #define xrint nsimd_sleef_rint_avx512_skylake_f64 #define xrintf nsimd_sleef_rint_avx512_skylake_f32 #define xnextafter nsimd_sleef_nextafter_avx512_skylake_f64 #define xnextafterf nsimd_sleef_nextafter_avx512_skylake_f32 #define xfrfrexp nsimd_sleef_frfrexp_avx512_skylake_f64 #define xfrfrexpf nsimd_sleef_frfrexp_avx512_skylake_f32 #define xexpfrexp nsimd_sleef_expfrexp_avx512_skylake_f64 #define xexpfrexpf nsimd_sleef_expfrexp_avx512_skylake_f32 #define xfmod nsimd_sleef_fmod_avx512_skylake_f64 #define xfmodf nsimd_sleef_fmod_avx512_skylake_f32 #define xremainder nsimd_sleef_remainder_avx512_skylake_f64 #define xremainderf nsimd_sleef_remainder_avx512_skylake_f32 #define xmodf nsimd_sleef_modf_avx512_skylake_f64 #define xmodff nsimd_sleef_modf_avx512_skylake_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_avx512_skylake_f64 #define xlgammaf_u1 
nsimd_sleef_lgamma_u10_avx512_skylake_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_avx512_skylake_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_avx512_skylake_f32 #define xerf_u1 nsimd_sleef_erf_u10_avx512_skylake_f64 #define xerff_u1 nsimd_sleef_erf_u10_avx512_skylake_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_avx512_skylake_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_avx512_skylake_f32 #define xgetInt nsimd_sleef_getInt_avx512_skylake_f64 #define xgetIntf nsimd_sleef_getInt_avx512_skylake_f32 #define xgetPtr nsimd_sleef_getPtr_avx512_skylake_f64 #define xgetPtrf nsimd_sleef_getPtr_avx512_skylake_f32 #endif #define rempi nsimd_sleef_rempi_avx512_skylake #define rempif nsimd_sleef_rempif_avx512_skylake #define rempisub nsimd_sleef_rempisub_avx512_skylake #define rempisubf nsimd_sleef_rempisubf_avx512_skylake #define gammak nsimd_gammak_avx512_skylake #define gammafk nsimd_gammafk_avx512_skylake #endif #endif ================================================ FILE: src/renameneon32.h ================================================ #ifndef RENAMENEON32_H #define RENAMENEON32_H /* ------------------------------------------------------------------------- */ /* Naming of functions neon128 */ #ifdef NSIMD_NEON128 #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_neon128_f64 #define xsinf nsimd_sleef_sin_u35d_neon128_f32 #define xcos nsimd_sleef_cos_u35d_neon128_f64 #define xcosf nsimd_sleef_cos_u35d_neon128_f32 #define xsincos nsimd_sleef_sincos_u35d_neon128_f64 #define xsincosf nsimd_sleef_sincos_u35d_neon128_f32 #define xtan nsimd_sleef_tan_u35d_neon128_f64 #define xtanf nsimd_sleef_tan_u35d_neon128_f32 #define xasin nsimd_sleef_asin_u35d_neon128_f64 #define xasinf nsimd_sleef_asin_u35d_neon128_f32 #define xacos nsimd_sleef_acos_u35d_neon128_f64 #define xacosf nsimd_sleef_acos_u35d_neon128_f32 #define xatan nsimd_sleef_atan_u35d_neon128_f64 #define xatanf nsimd_sleef_atan_u35d_neon128_f32 #define xatan2 nsimd_sleef_atan2_u35d_neon128_f64 #define xatan2f 
nsimd_sleef_atan2_u35d_neon128_f32 #define xlog nsimd_sleef_log_u35d_neon128_f64 #define xlogf nsimd_sleef_log_u35d_neon128_f32 #define xcbrt nsimd_sleef_cbrt_u35d_neon128_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_neon128_f32 #define xsin_u1 nsimd_sleef_sin_u10d_neon128_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_neon128_f32 #define xcos_u1 nsimd_sleef_cos_u10d_neon128_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_neon128_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_neon128_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_neon128_f32 #define xtan_u1 nsimd_sleef_tan_u10d_neon128_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_neon128_f32 #define xasin_u1 nsimd_sleef_asin_u10d_neon128_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_neon128_f32 #define xacos_u1 nsimd_sleef_acos_u10d_neon128_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_neon128_f32 #define xatan_u1 nsimd_sleef_atan_u10d_neon128_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_neon128_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_neon128_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_neon128_f32 #define xlog_u1 nsimd_sleef_log_u10d_neon128_f64 #define xlogf_u1 nsimd_sleef_log_u10d_neon128_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_neon128_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_neon128_f32 #define xexp nsimd_sleef_exp_u10d_neon128_f64 #define xexpf nsimd_sleef_exp_u10d_neon128_f32 #define xpow nsimd_sleef_pow_u10d_neon128_f64 #define xpowf nsimd_sleef_pow_u10d_neon128_f32 #define xsinh nsimd_sleef_sinh_u10d_neon128_f64 #define xsinhf nsimd_sleef_sinh_u10d_neon128_f32 #define xcosh nsimd_sleef_cosh_u10d_neon128_f64 #define xcoshf nsimd_sleef_cosh_u10d_neon128_f32 #define xtanh nsimd_sleef_tanh_u10d_neon128_f64 #define xtanhf nsimd_sleef_tanh_u10d_neon128_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_neon128_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_neon128_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_neon128_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_neon128_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_neon128_f64 #define 
xtanhf_u35 nsimd_sleef_tanh_u35d_neon128_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_neon128_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_neon128_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_neon128_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_neon128_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_neon128_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_neon128_f32 #define xasinh nsimd_sleef_asinh_u10d_neon128_f64 #define xasinhf nsimd_sleef_asinh_u10d_neon128_f32 #define xacosh nsimd_sleef_acosh_u10d_neon128_f64 #define xacoshf nsimd_sleef_acosh_u10d_neon128_f32 #define xatanh nsimd_sleef_atanh_u10d_neon128_f64 #define xatanhf nsimd_sleef_atanh_u10d_neon128_f32 #define xexp2 nsimd_sleef_exp2_u10d_neon128_f64 #define xexp2f nsimd_sleef_exp2_u10d_neon128_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_neon128_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_neon128_f32 #define xexp10 nsimd_sleef_exp10_u10d_neon128_f64 #define xexp10f nsimd_sleef_exp10_u10d_neon128_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_neon128_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_neon128_f32 #define xexpm1 nsimd_sleef_expm1_u10d_neon128_f64 #define xexpm1f nsimd_sleef_expm1_u10d_neon128_f32 #define xlog10 nsimd_sleef_log10_u10d_neon128_f64 #define xlog10f nsimd_sleef_log10_u10d_neon128_f32 #define xlog2 nsimd_sleef_log2_u10d_neon128_f64 #define xlog2f nsimd_sleef_log2_u10d_neon128_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_neon128_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_neon128_f32 #define xlog1p nsimd_sleef_log1p_u10d_neon128_f64 #define xlog1pf nsimd_sleef_log1p_u10d_neon128_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_neon128_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_neon128_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_neon128_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_neon128_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_neon128_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_neon128_f32 #define 
xcospi_u05 nsimd_sleef_cospi_u05d_neon128_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_neon128_f32 #define xldexp nsimd_sleef_ldexp_neon128_f64 #define xldexpf nsimd_sleef_ldexp_neon128_f32 #define xilogb nsimd_sleef_ilogb_neon128_f64 #define xilogbf nsimd_sleef_ilogb_neon128_f32 #define xfma nsimd_sleef_fma_neon128_f64 #define xfmaf nsimd_sleef_fma_neon128_f32 #define xsqrt nsimd_sleef_sqrt_neon128_f64 #define xsqrtf nsimd_sleef_sqrt_neon128_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_neon128_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_neon128_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_neon128_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_neon128_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_neon128_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_neon128_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_neon128_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_neon128_f32 #define xfabs nsimd_sleef_fabs_neon128_f64 #define xfabsf nsimd_sleef_fabs_neon128_f32 #define xcopysign nsimd_sleef_copysign_neon128_f64 #define xcopysignf nsimd_sleef_copysign_neon128_f32 #define xfmax nsimd_sleef_fmax_neon128_f64 #define xfmaxf nsimd_sleef_fmax_neon128_f32 #define xfmin nsimd_sleef_fmin_neon128_f64 #define xfminf nsimd_sleef_fmin_neon128_f32 #define xfdim nsimd_sleef_fdim_neon128_f64 #define xfdimf nsimd_sleef_fdim_neon128_f32 #define xtrunc nsimd_sleef_trunc_neon128_f64 #define xtruncf nsimd_sleef_trunc_neon128_f32 #define xfloor nsimd_sleef_floor_neon128_f64 #define xfloorf nsimd_sleef_floor_neon128_f32 #define xceil nsimd_sleef_ceil_neon128_f64 #define xceilf nsimd_sleef_ceil_neon128_f32 #define xround nsimd_sleef_round_neon128_f64 #define xroundf nsimd_sleef_round_neon128_f32 #define xrint nsimd_sleef_rint_neon128_f64 #define xrintf nsimd_sleef_rint_neon128_f32 #define xnextafter nsimd_sleef_nextafter_neon128_f64 #define xnextafterf nsimd_sleef_nextafter_neon128_f32 #define xfrfrexp nsimd_sleef_frfrexp_neon128_f64 #define xfrfrexpf nsimd_sleef_frfrexp_neon128_f32 #define 
xexpfrexp nsimd_sleef_expfrexp_neon128_f64 #define xexpfrexpf nsimd_sleef_expfrexp_neon128_f32 #define xfmod nsimd_sleef_fmod_neon128_f64 #define xfmodf nsimd_sleef_fmod_neon128_f32 #define xremainder nsimd_sleef_remainder_neon128_f64 #define xremainderf nsimd_sleef_remainder_neon128_f32 #define xmodf nsimd_sleef_modf_neon128_f64 #define xmodff nsimd_sleef_modf_neon128_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_neon128_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_neon128_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_neon128_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_neon128_f32 #define xerf_u1 nsimd_sleef_erf_u10d_neon128_f64 #define xerff_u1 nsimd_sleef_erf_u10d_neon128_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_neon128_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_neon128_f32 #define xgetInt nsimd_sleef_getInt_neon128_f64 #define xgetIntf nsimd_sleef_getInt_neon128_f32 #define xgetPtr nsimd_sleef_getPtr_neon128_f64 #define xgetPtrf nsimd_sleef_getPtr_neon128_f32 #else #define xsin nsimd_sleef_sin_u35_neon128_f64 #define xsinf nsimd_sleef_sin_u35_neon128_f32 #define xcos nsimd_sleef_cos_u35_neon128_f64 #define xcosf nsimd_sleef_cos_u35_neon128_f32 #define xsincos nsimd_sleef_sincos_u35_neon128_f64 #define xsincosf nsimd_sleef_sincos_u35_neon128_f32 #define xtan nsimd_sleef_tan_u35_neon128_f64 #define xtanf nsimd_sleef_tan_u35_neon128_f32 #define xasin nsimd_sleef_asin_u35_neon128_f64 #define xasinf nsimd_sleef_asin_u35_neon128_f32 #define xacos nsimd_sleef_acos_u35_neon128_f64 #define xacosf nsimd_sleef_acos_u35_neon128_f32 #define xatan nsimd_sleef_atan_u35_neon128_f64 #define xatanf nsimd_sleef_atan_u35_neon128_f32 #define xatan2 nsimd_sleef_atan2_u35_neon128_f64 #define xatan2f nsimd_sleef_atan2_u35_neon128_f32 #define xlog nsimd_sleef_log_u35_neon128_f64 #define xlogf nsimd_sleef_log_u35_neon128_f32 #define xcbrt nsimd_sleef_cbrt_u35_neon128_f64 #define xcbrtf nsimd_sleef_cbrt_u35_neon128_f32 #define xsin_u1 nsimd_sleef_sin_u10_neon128_f64 #define 
xsinf_u1 nsimd_sleef_sin_u10_neon128_f32 #define xcos_u1 nsimd_sleef_cos_u10_neon128_f64 #define xcosf_u1 nsimd_sleef_cos_u10_neon128_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_neon128_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_neon128_f32 #define xtan_u1 nsimd_sleef_tan_u10_neon128_f64 #define xtanf_u1 nsimd_sleef_tan_u10_neon128_f32 #define xasin_u1 nsimd_sleef_asin_u10_neon128_f64 #define xasinf_u1 nsimd_sleef_asin_u10_neon128_f32 #define xacos_u1 nsimd_sleef_acos_u10_neon128_f64 #define xacosf_u1 nsimd_sleef_acos_u10_neon128_f32 #define xatan_u1 nsimd_sleef_atan_u10_neon128_f64 #define xatanf_u1 nsimd_sleef_atan_u10_neon128_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_neon128_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_neon128_f32 #define xlog_u1 nsimd_sleef_log_u10_neon128_f64 #define xlogf_u1 nsimd_sleef_log_u10_neon128_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_neon128_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_neon128_f32 #define xexp nsimd_sleef_exp_u10_neon128_f64 #define xexpf nsimd_sleef_exp_u10_neon128_f32 #define xpow nsimd_sleef_pow_u10_neon128_f64 #define xpowf nsimd_sleef_pow_u10_neon128_f32 #define xsinh nsimd_sleef_sinh_u10_neon128_f64 #define xsinhf nsimd_sleef_sinh_u10_neon128_f32 #define xcosh nsimd_sleef_cosh_u10_neon128_f64 #define xcoshf nsimd_sleef_cosh_u10_neon128_f32 #define xtanh nsimd_sleef_tanh_u10_neon128_f64 #define xtanhf nsimd_sleef_tanh_u10_neon128_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_neon128_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_neon128_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_neon128_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_neon128_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_neon128_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_neon128_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_neon128_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_neon128_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_neon128_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_neon128_f32 #define xfastpow_u3500 
nsimd_sleef_fastpow_u3500_neon128_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_neon128_f32 #define xasinh nsimd_sleef_asinh_u10_neon128_f64 #define xasinhf nsimd_sleef_asinh_u10_neon128_f32 #define xacosh nsimd_sleef_acosh_u10_neon128_f64 #define xacoshf nsimd_sleef_acosh_u10_neon128_f32 #define xatanh nsimd_sleef_atanh_u10_neon128_f64 #define xatanhf nsimd_sleef_atanh_u10_neon128_f32 #define xexp2 nsimd_sleef_exp2_u10_neon128_f64 #define xexp2f nsimd_sleef_exp2_u10_neon128_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_neon128_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_neon128_f32 #define xexp10 nsimd_sleef_exp10_u10_neon128_f64 #define xexp10f nsimd_sleef_exp10_u10_neon128_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_neon128_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_neon128_f32 #define xexpm1 nsimd_sleef_expm1_u10_neon128_f64 #define xexpm1f nsimd_sleef_expm1_u10_neon128_f32 #define xlog10 nsimd_sleef_log10_u10_neon128_f64 #define xlog10f nsimd_sleef_log10_u10_neon128_f32 #define xlog2 nsimd_sleef_log2_u10_neon128_f64 #define xlog2f nsimd_sleef_log2_u10_neon128_f32 #define xlog2_u35 nsimd_sleef_log2_u35_neon128_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_neon128_f32 #define xlog1p nsimd_sleef_log1p_u10_neon128_f64 #define xlog1pf nsimd_sleef_log1p_u10_neon128_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_neon128_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_neon128_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_neon128_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_neon128_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_neon128_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_neon128_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_neon128_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_neon128_f32 #define xldexp nsimd_sleef_ldexp_neon128_f64 #define xldexpf nsimd_sleef_ldexp_neon128_f32 #define xilogb nsimd_sleef_ilogb_neon128_f64 #define xilogbf nsimd_sleef_ilogb_neon128_f32 #define xfma nsimd_sleef_fma_neon128_f64 #define xfmaf 
nsimd_sleef_fma_neon128_f32 #define xsqrt nsimd_sleef_sqrt_neon128_f64 #define xsqrtf nsimd_sleef_sqrt_neon128_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_neon128_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_neon128_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_neon128_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_neon128_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_neon128_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_neon128_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_neon128_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_neon128_f32 #define xfabs nsimd_sleef_fabs_neon128_f64 #define xfabsf nsimd_sleef_fabs_neon128_f32 #define xcopysign nsimd_sleef_copysign_neon128_f64 #define xcopysignf nsimd_sleef_copysign_neon128_f32 #define xfmax nsimd_sleef_fmax_neon128_f64 #define xfmaxf nsimd_sleef_fmax_neon128_f32 #define xfmin nsimd_sleef_fmin_neon128_f64 #define xfminf nsimd_sleef_fmin_neon128_f32 #define xfdim nsimd_sleef_fdim_neon128_f64 #define xfdimf nsimd_sleef_fdim_neon128_f32 #define xtrunc nsimd_sleef_trunc_neon128_f64 #define xtruncf nsimd_sleef_trunc_neon128_f32 #define xfloor nsimd_sleef_floor_neon128_f64 #define xfloorf nsimd_sleef_floor_neon128_f32 #define xceil nsimd_sleef_ceil_neon128_f64 #define xceilf nsimd_sleef_ceil_neon128_f32 #define xround nsimd_sleef_round_neon128_f64 #define xroundf nsimd_sleef_round_neon128_f32 #define xrint nsimd_sleef_rint_neon128_f64 #define xrintf nsimd_sleef_rint_neon128_f32 #define xnextafter nsimd_sleef_nextafter_neon128_f64 #define xnextafterf nsimd_sleef_nextafter_neon128_f32 #define xfrfrexp nsimd_sleef_frfrexp_neon128_f64 #define xfrfrexpf nsimd_sleef_frfrexp_neon128_f32 #define xexpfrexp nsimd_sleef_expfrexp_neon128_f64 #define xexpfrexpf nsimd_sleef_expfrexp_neon128_f32 #define xfmod nsimd_sleef_fmod_neon128_f64 #define xfmodf nsimd_sleef_fmod_neon128_f32 #define xremainder nsimd_sleef_remainder_neon128_f64 #define xremainderf nsimd_sleef_remainder_neon128_f32 #define xmodf nsimd_sleef_modf_neon128_f64 #define xmodff 
nsimd_sleef_modf_neon128_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_neon128_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_neon128_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_neon128_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_neon128_f32 #define xerf_u1 nsimd_sleef_erf_u10_neon128_f64 #define xerff_u1 nsimd_sleef_erf_u10_neon128_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_neon128_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_neon128_f32 #define xgetInt nsimd_sleef_getInt_neon128_f64 #define xgetIntf nsimd_sleef_getInt_neon128_f32 #define xgetPtr nsimd_sleef_getPtr_neon128_f64 #define xgetPtrf nsimd_sleef_getPtr_neon128_f32 #endif #define rempi nsimd_sleef_rempi_neon128 #define rempif nsimd_sleef_rempif_neon128 #define rempisub nsimd_sleef_rempisub_neon128 #define rempisubf nsimd_sleef_rempisubf_neon128 #define gammak nsimd_gammak_neon128 #define gammafk nsimd_gammafk_neon128 #endif #endif ================================================ FILE: src/renamesse2.h ================================================ #ifndef RENAMESSE2_H #define RENAMESSE2_H /* ------------------------------------------------------------------------- */ /* Naming of functions sse2 */ #ifdef NSIMD_SSE2 #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_sse2_f64 #define xsinf nsimd_sleef_sin_u35d_sse2_f32 #define xcos nsimd_sleef_cos_u35d_sse2_f64 #define xcosf nsimd_sleef_cos_u35d_sse2_f32 #define xsincos nsimd_sleef_sincos_u35d_sse2_f64 #define xsincosf nsimd_sleef_sincos_u35d_sse2_f32 #define xtan nsimd_sleef_tan_u35d_sse2_f64 #define xtanf nsimd_sleef_tan_u35d_sse2_f32 #define xasin nsimd_sleef_asin_u35d_sse2_f64 #define xasinf nsimd_sleef_asin_u35d_sse2_f32 #define xacos nsimd_sleef_acos_u35d_sse2_f64 #define xacosf nsimd_sleef_acos_u35d_sse2_f32 #define xatan nsimd_sleef_atan_u35d_sse2_f64 #define xatanf nsimd_sleef_atan_u35d_sse2_f32 #define xatan2 nsimd_sleef_atan2_u35d_sse2_f64 #define xatan2f nsimd_sleef_atan2_u35d_sse2_f32 #define xlog nsimd_sleef_log_u35d_sse2_f64 #define 
xlogf nsimd_sleef_log_u35d_sse2_f32 #define xcbrt nsimd_sleef_cbrt_u35d_sse2_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_sse2_f32 #define xsin_u1 nsimd_sleef_sin_u10d_sse2_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_sse2_f32 #define xcos_u1 nsimd_sleef_cos_u10d_sse2_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_sse2_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_sse2_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_sse2_f32 #define xtan_u1 nsimd_sleef_tan_u10d_sse2_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_sse2_f32 #define xasin_u1 nsimd_sleef_asin_u10d_sse2_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_sse2_f32 #define xacos_u1 nsimd_sleef_acos_u10d_sse2_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_sse2_f32 #define xatan_u1 nsimd_sleef_atan_u10d_sse2_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_sse2_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_sse2_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_sse2_f32 #define xlog_u1 nsimd_sleef_log_u10d_sse2_f64 #define xlogf_u1 nsimd_sleef_log_u10d_sse2_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_sse2_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sse2_f32 #define xexp nsimd_sleef_exp_u10d_sse2_f64 #define xexpf nsimd_sleef_exp_u10d_sse2_f32 #define xpow nsimd_sleef_pow_u10d_sse2_f64 #define xpowf nsimd_sleef_pow_u10d_sse2_f32 #define xsinh nsimd_sleef_sinh_u10d_sse2_f64 #define xsinhf nsimd_sleef_sinh_u10d_sse2_f32 #define xcosh nsimd_sleef_cosh_u10d_sse2_f64 #define xcoshf nsimd_sleef_cosh_u10d_sse2_f32 #define xtanh nsimd_sleef_tanh_u10d_sse2_f64 #define xtanhf nsimd_sleef_tanh_u10d_sse2_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_sse2_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_sse2_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_sse2_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_sse2_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_sse2_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_sse2_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sse2_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sse2_f32 #define xfastcos_u3500 
nsimd_sleef_fastcos_u3500d_sse2_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sse2_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sse2_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sse2_f32 #define xasinh nsimd_sleef_asinh_u10d_sse2_f64 #define xasinhf nsimd_sleef_asinh_u10d_sse2_f32 #define xacosh nsimd_sleef_acosh_u10d_sse2_f64 #define xacoshf nsimd_sleef_acosh_u10d_sse2_f32 #define xatanh nsimd_sleef_atanh_u10d_sse2_f64 #define xatanhf nsimd_sleef_atanh_u10d_sse2_f32 #define xexp2 nsimd_sleef_exp2_u10d_sse2_f64 #define xexp2f nsimd_sleef_exp2_u10d_sse2_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_sse2_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_sse2_f32 #define xexp10 nsimd_sleef_exp10_u10d_sse2_f64 #define xexp10f nsimd_sleef_exp10_u10d_sse2_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_sse2_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_sse2_f32 #define xexpm1 nsimd_sleef_expm1_u10d_sse2_f64 #define xexpm1f nsimd_sleef_expm1_u10d_sse2_f32 #define xlog10 nsimd_sleef_log10_u10d_sse2_f64 #define xlog10f nsimd_sleef_log10_u10d_sse2_f32 #define xlog2 nsimd_sleef_log2_u10d_sse2_f64 #define xlog2f nsimd_sleef_log2_u10d_sse2_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_sse2_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_sse2_f32 #define xlog1p nsimd_sleef_log1p_u10d_sse2_f64 #define xlog1pf nsimd_sleef_log1p_u10d_sse2_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_sse2_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_sse2_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_sse2_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_sse2_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_sse2_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_sse2_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_sse2_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_sse2_f32 #define xldexp nsimd_sleef_ldexp_sse2_f64 #define xldexpf nsimd_sleef_ldexp_sse2_f32 #define xilogb nsimd_sleef_ilogb_sse2_f64 #define xilogbf nsimd_sleef_ilogb_sse2_f32 #define xfma 
nsimd_sleef_fma_sse2_f64 #define xfmaf nsimd_sleef_fma_sse2_f32 #define xsqrt nsimd_sleef_sqrt_sse2_f64 #define xsqrtf nsimd_sleef_sqrt_sse2_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_sse2_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sse2_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_sse2_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sse2_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_sse2_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_sse2_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_sse2_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_sse2_f32 #define xfabs nsimd_sleef_fabs_sse2_f64 #define xfabsf nsimd_sleef_fabs_sse2_f32 #define xcopysign nsimd_sleef_copysign_sse2_f64 #define xcopysignf nsimd_sleef_copysign_sse2_f32 #define xfmax nsimd_sleef_fmax_sse2_f64 #define xfmaxf nsimd_sleef_fmax_sse2_f32 #define xfmin nsimd_sleef_fmin_sse2_f64 #define xfminf nsimd_sleef_fmin_sse2_f32 #define xfdim nsimd_sleef_fdim_sse2_f64 #define xfdimf nsimd_sleef_fdim_sse2_f32 #define xtrunc nsimd_sleef_trunc_sse2_f64 #define xtruncf nsimd_sleef_trunc_sse2_f32 #define xfloor nsimd_sleef_floor_sse2_f64 #define xfloorf nsimd_sleef_floor_sse2_f32 #define xceil nsimd_sleef_ceil_sse2_f64 #define xceilf nsimd_sleef_ceil_sse2_f32 #define xround nsimd_sleef_round_sse2_f64 #define xroundf nsimd_sleef_round_sse2_f32 #define xrint nsimd_sleef_rint_sse2_f64 #define xrintf nsimd_sleef_rint_sse2_f32 #define xnextafter nsimd_sleef_nextafter_sse2_f64 #define xnextafterf nsimd_sleef_nextafter_sse2_f32 #define xfrfrexp nsimd_sleef_frfrexp_sse2_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sse2_f32 #define xexpfrexp nsimd_sleef_expfrexp_sse2_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sse2_f32 #define xfmod nsimd_sleef_fmod_sse2_f64 #define xfmodf nsimd_sleef_fmod_sse2_f32 #define xremainder nsimd_sleef_remainder_sse2_f64 #define xremainderf nsimd_sleef_remainder_sse2_f32 #define xmodf nsimd_sleef_modf_sse2_f64 #define xmodff nsimd_sleef_modf_sse2_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_sse2_f64 
#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sse2_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_sse2_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sse2_f32 #define xerf_u1 nsimd_sleef_erf_u10d_sse2_f64 #define xerff_u1 nsimd_sleef_erf_u10d_sse2_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_sse2_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_sse2_f32 #define xgetInt nsimd_sleef_getInt_sse2_f64 #define xgetIntf nsimd_sleef_getInt_sse2_f32 #define xgetPtr nsimd_sleef_getPtr_sse2_f64 #define xgetPtrf nsimd_sleef_getPtr_sse2_f32 #else #define xsin nsimd_sleef_sin_u35_sse2_f64 #define xsinf nsimd_sleef_sin_u35_sse2_f32 #define xcos nsimd_sleef_cos_u35_sse2_f64 #define xcosf nsimd_sleef_cos_u35_sse2_f32 #define xsincos nsimd_sleef_sincos_u35_sse2_f64 #define xsincosf nsimd_sleef_sincos_u35_sse2_f32 #define xtan nsimd_sleef_tan_u35_sse2_f64 #define xtanf nsimd_sleef_tan_u35_sse2_f32 #define xasin nsimd_sleef_asin_u35_sse2_f64 #define xasinf nsimd_sleef_asin_u35_sse2_f32 #define xacos nsimd_sleef_acos_u35_sse2_f64 #define xacosf nsimd_sleef_acos_u35_sse2_f32 #define xatan nsimd_sleef_atan_u35_sse2_f64 #define xatanf nsimd_sleef_atan_u35_sse2_f32 #define xatan2 nsimd_sleef_atan2_u35_sse2_f64 #define xatan2f nsimd_sleef_atan2_u35_sse2_f32 #define xlog nsimd_sleef_log_u35_sse2_f64 #define xlogf nsimd_sleef_log_u35_sse2_f32 #define xcbrt nsimd_sleef_cbrt_u35_sse2_f64 #define xcbrtf nsimd_sleef_cbrt_u35_sse2_f32 #define xsin_u1 nsimd_sleef_sin_u10_sse2_f64 #define xsinf_u1 nsimd_sleef_sin_u10_sse2_f32 #define xcos_u1 nsimd_sleef_cos_u10_sse2_f64 #define xcosf_u1 nsimd_sleef_cos_u10_sse2_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_sse2_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_sse2_f32 #define xtan_u1 nsimd_sleef_tan_u10_sse2_f64 #define xtanf_u1 nsimd_sleef_tan_u10_sse2_f32 #define xasin_u1 nsimd_sleef_asin_u10_sse2_f64 #define xasinf_u1 nsimd_sleef_asin_u10_sse2_f32 #define xacos_u1 nsimd_sleef_acos_u10_sse2_f64 #define xacosf_u1 nsimd_sleef_acos_u10_sse2_f32 #define 
xatan_u1 nsimd_sleef_atan_u10_sse2_f64 #define xatanf_u1 nsimd_sleef_atan_u10_sse2_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_sse2_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_sse2_f32 #define xlog_u1 nsimd_sleef_log_u10_sse2_f64 #define xlogf_u1 nsimd_sleef_log_u10_sse2_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_sse2_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_sse2_f32 #define xexp nsimd_sleef_exp_u10_sse2_f64 #define xexpf nsimd_sleef_exp_u10_sse2_f32 #define xpow nsimd_sleef_pow_u10_sse2_f64 #define xpowf nsimd_sleef_pow_u10_sse2_f32 #define xsinh nsimd_sleef_sinh_u10_sse2_f64 #define xsinhf nsimd_sleef_sinh_u10_sse2_f32 #define xcosh nsimd_sleef_cosh_u10_sse2_f64 #define xcoshf nsimd_sleef_cosh_u10_sse2_f32 #define xtanh nsimd_sleef_tanh_u10_sse2_f64 #define xtanhf nsimd_sleef_tanh_u10_sse2_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_sse2_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_sse2_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_sse2_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_sse2_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_sse2_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_sse2_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sse2_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sse2_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sse2_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sse2_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sse2_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sse2_f32 #define xasinh nsimd_sleef_asinh_u10_sse2_f64 #define xasinhf nsimd_sleef_asinh_u10_sse2_f32 #define xacosh nsimd_sleef_acosh_u10_sse2_f64 #define xacoshf nsimd_sleef_acosh_u10_sse2_f32 #define xatanh nsimd_sleef_atanh_u10_sse2_f64 #define xatanhf nsimd_sleef_atanh_u10_sse2_f32 #define xexp2 nsimd_sleef_exp2_u10_sse2_f64 #define xexp2f nsimd_sleef_exp2_u10_sse2_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_sse2_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_sse2_f32 #define xexp10 nsimd_sleef_exp10_u10_sse2_f64 #define xexp10f 
nsimd_sleef_exp10_u10_sse2_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_sse2_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_sse2_f32 #define xexpm1 nsimd_sleef_expm1_u10_sse2_f64 #define xexpm1f nsimd_sleef_expm1_u10_sse2_f32 #define xlog10 nsimd_sleef_log10_u10_sse2_f64 #define xlog10f nsimd_sleef_log10_u10_sse2_f32 #define xlog2 nsimd_sleef_log2_u10_sse2_f64 #define xlog2f nsimd_sleef_log2_u10_sse2_f32 #define xlog2_u35 nsimd_sleef_log2_u35_sse2_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_sse2_f32 #define xlog1p nsimd_sleef_log1p_u10_sse2_f64 #define xlog1pf nsimd_sleef_log1p_u10_sse2_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_sse2_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_sse2_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_sse2_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_sse2_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_sse2_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_sse2_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_sse2_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_sse2_f32 #define xldexp nsimd_sleef_ldexp_sse2_f64 #define xldexpf nsimd_sleef_ldexp_sse2_f32 #define xilogb nsimd_sleef_ilogb_sse2_f64 #define xilogbf nsimd_sleef_ilogb_sse2_f32 #define xfma nsimd_sleef_fma_sse2_f64 #define xfmaf nsimd_sleef_fma_sse2_f32 #define xsqrt nsimd_sleef_sqrt_sse2_f64 #define xsqrtf nsimd_sleef_sqrt_sse2_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_sse2_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_sse2_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_sse2_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_sse2_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_sse2_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_sse2_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_sse2_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_sse2_f32 #define xfabs nsimd_sleef_fabs_sse2_f64 #define xfabsf nsimd_sleef_fabs_sse2_f32 #define xcopysign nsimd_sleef_copysign_sse2_f64 #define xcopysignf nsimd_sleef_copysign_sse2_f32 #define xfmax nsimd_sleef_fmax_sse2_f64 #define xfmaxf 
nsimd_sleef_fmax_sse2_f32 #define xfmin nsimd_sleef_fmin_sse2_f64 #define xfminf nsimd_sleef_fmin_sse2_f32 #define xfdim nsimd_sleef_fdim_sse2_f64 #define xfdimf nsimd_sleef_fdim_sse2_f32 #define xtrunc nsimd_sleef_trunc_sse2_f64 #define xtruncf nsimd_sleef_trunc_sse2_f32 #define xfloor nsimd_sleef_floor_sse2_f64 #define xfloorf nsimd_sleef_floor_sse2_f32 #define xceil nsimd_sleef_ceil_sse2_f64 #define xceilf nsimd_sleef_ceil_sse2_f32 #define xround nsimd_sleef_round_sse2_f64 #define xroundf nsimd_sleef_round_sse2_f32 #define xrint nsimd_sleef_rint_sse2_f64 #define xrintf nsimd_sleef_rint_sse2_f32 #define xnextafter nsimd_sleef_nextafter_sse2_f64 #define xnextafterf nsimd_sleef_nextafter_sse2_f32 #define xfrfrexp nsimd_sleef_frfrexp_sse2_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sse2_f32 #define xexpfrexp nsimd_sleef_expfrexp_sse2_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sse2_f32 #define xfmod nsimd_sleef_fmod_sse2_f64 #define xfmodf nsimd_sleef_fmod_sse2_f32 #define xremainder nsimd_sleef_remainder_sse2_f64 #define xremainderf nsimd_sleef_remainder_sse2_f32 #define xmodf nsimd_sleef_modf_sse2_f64 #define xmodff nsimd_sleef_modf_sse2_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_sse2_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_sse2_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_sse2_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_sse2_f32 #define xerf_u1 nsimd_sleef_erf_u10_sse2_f64 #define xerff_u1 nsimd_sleef_erf_u10_sse2_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_sse2_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_sse2_f32 #define xgetInt nsimd_sleef_getInt_sse2_f64 #define xgetIntf nsimd_sleef_getInt_sse2_f32 #define xgetPtr nsimd_sleef_getPtr_sse2_f64 #define xgetPtrf nsimd_sleef_getPtr_sse2_f32 #endif #define rempi nsimd_sleef_rempi_sse2 #define rempif nsimd_sleef_rempif_sse2 #define rempisub nsimd_sleef_rempisub_sse2 #define rempisubf nsimd_sleef_rempisubf_sse2 #define gammak nsimd_gammak_sse2 #define gammafk nsimd_gammafk_sse2 #endif #endif 
================================================ FILE: src/renamesse4.h ================================================ #ifndef RENAMESSE4_H
#define RENAMESSE4_H

/* ------------------------------------------------------------------------- */
/* Naming of functions sse42 */

/* Auto-generated rename header: maps SLEEF's internal short function names
   (xsin, xsinf, xsqrt_u05, ...) onto NSIMD-namespaced symbols for the
   SSE4.2 target, so that the same SLEEF kernel sources can be compiled once
   per SIMD extension without symbol clashes.

   Conventions visible in the mappings below:
   - trailing 'f' on the short name selects the f32 variant, otherwise f64;
   - when DETERMINISTIC is defined, the aliases point at the 'd'-suffixed
     accuracy tags (u35d, u10d, ...) instead of the plain ones — presumably
     the bit-reproducible SLEEF kernels (TODO confirm against the generator
     in egg/get_sleef_code.py);
   - the _u05/_u10/_u15/_u35/_u3500 tags follow SLEEF's accuracy-suffix
     convention (NOTE(review): in SLEEF these denote max error in units of
     0.05 ULP steps, e.g. u10 = 1.0 ULP — confirm against SLEEF docs);
   - functions with a single variant (ldexp, ilogb, fma, fabs, copysign,
     fmax/fmin, trunc/floor/ceil/round/rint, nextafter, frfrexp/expfrexp,
     fmod/remainder/modf, getInt/getPtr) carry no accuracy tag and no 'd'
     suffix in either branch.

   Do not edit by hand: this file is generated (see egg/ scripts). */

#ifdef NSIMD_SSE42

/* DETERMINISTIC branch: aliases resolve to the 'd'-tagged kernels. */
#ifdef DETERMINISTIC
#define xsin nsimd_sleef_sin_u35d_sse42_f64
#define xsinf nsimd_sleef_sin_u35d_sse42_f32
#define xcos nsimd_sleef_cos_u35d_sse42_f64
#define xcosf nsimd_sleef_cos_u35d_sse42_f32
#define xsincos nsimd_sleef_sincos_u35d_sse42_f64
#define xsincosf nsimd_sleef_sincos_u35d_sse42_f32
#define xtan nsimd_sleef_tan_u35d_sse42_f64
#define xtanf nsimd_sleef_tan_u35d_sse42_f32
#define xasin nsimd_sleef_asin_u35d_sse42_f64
#define xasinf nsimd_sleef_asin_u35d_sse42_f32
#define xacos nsimd_sleef_acos_u35d_sse42_f64
#define xacosf nsimd_sleef_acos_u35d_sse42_f32
#define xatan nsimd_sleef_atan_u35d_sse42_f64
#define xatanf nsimd_sleef_atan_u35d_sse42_f32
#define xatan2 nsimd_sleef_atan2_u35d_sse42_f64
#define xatan2f nsimd_sleef_atan2_u35d_sse42_f32
#define xlog nsimd_sleef_log_u35d_sse42_f64
#define xlogf nsimd_sleef_log_u35d_sse42_f32
#define xcbrt nsimd_sleef_cbrt_u35d_sse42_f64
#define xcbrtf nsimd_sleef_cbrt_u35d_sse42_f32
/* _u1 short names select the higher-accuracy u10 kernels. */
#define xsin_u1 nsimd_sleef_sin_u10d_sse42_f64
#define xsinf_u1 nsimd_sleef_sin_u10d_sse42_f32
#define xcos_u1 nsimd_sleef_cos_u10d_sse42_f64
#define xcosf_u1 nsimd_sleef_cos_u10d_sse42_f32
#define xsincos_u1 nsimd_sleef_sincos_u10d_sse42_f64
#define xsincosf_u1 nsimd_sleef_sincos_u10d_sse42_f32
#define xtan_u1 nsimd_sleef_tan_u10d_sse42_f64
#define xtanf_u1 nsimd_sleef_tan_u10d_sse42_f32
#define xasin_u1 nsimd_sleef_asin_u10d_sse42_f64
#define xasinf_u1 nsimd_sleef_asin_u10d_sse42_f32
#define xacos_u1 nsimd_sleef_acos_u10d_sse42_f64
#define xacosf_u1 nsimd_sleef_acos_u10d_sse42_f32
#define xatan_u1 nsimd_sleef_atan_u10d_sse42_f64
#define xatanf_u1 nsimd_sleef_atan_u10d_sse42_f32
#define xatan2_u1 nsimd_sleef_atan2_u10d_sse42_f64
#define xatan2f_u1 nsimd_sleef_atan2_u10d_sse42_f32
#define xlog_u1 nsimd_sleef_log_u10d_sse42_f64
#define xlogf_u1 nsimd_sleef_log_u10d_sse42_f32
#define xcbrt_u1 nsimd_sleef_cbrt_u10d_sse42_f64
#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sse42_f32
#define xexp nsimd_sleef_exp_u10d_sse42_f64
#define xexpf nsimd_sleef_exp_u10d_sse42_f32
#define xpow nsimd_sleef_pow_u10d_sse42_f64
#define xpowf nsimd_sleef_pow_u10d_sse42_f32
#define xsinh nsimd_sleef_sinh_u10d_sse42_f64
#define xsinhf nsimd_sleef_sinh_u10d_sse42_f32
#define xcosh nsimd_sleef_cosh_u10d_sse42_f64
#define xcoshf nsimd_sleef_cosh_u10d_sse42_f32
#define xtanh nsimd_sleef_tanh_u10d_sse42_f64
#define xtanhf nsimd_sleef_tanh_u10d_sse42_f32
#define xsinh_u35 nsimd_sleef_sinh_u35d_sse42_f64
#define xsinhf_u35 nsimd_sleef_sinh_u35d_sse42_f32
#define xcosh_u35 nsimd_sleef_cosh_u35d_sse42_f64
#define xcoshf_u35 nsimd_sleef_cosh_u35d_sse42_f32
#define xtanh_u35 nsimd_sleef_tanh_u35d_sse42_f64
#define xtanhf_u35 nsimd_sleef_tanh_u35d_sse42_f32
#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sse42_f64
#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sse42_f32
#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sse42_f64
#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sse42_f32
#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sse42_f64
#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sse42_f32
#define xasinh nsimd_sleef_asinh_u10d_sse42_f64
#define xasinhf nsimd_sleef_asinh_u10d_sse42_f32
#define xacosh nsimd_sleef_acosh_u10d_sse42_f64
#define xacoshf nsimd_sleef_acosh_u10d_sse42_f32
#define xatanh nsimd_sleef_atanh_u10d_sse42_f64
#define xatanhf nsimd_sleef_atanh_u10d_sse42_f32
#define xexp2 nsimd_sleef_exp2_u10d_sse42_f64
#define xexp2f nsimd_sleef_exp2_u10d_sse42_f32
#define xexp2_u35 nsimd_sleef_exp2_u35d_sse42_f64
#define xexp2f_u35 nsimd_sleef_exp2_u35d_sse42_f32
#define xexp10 nsimd_sleef_exp10_u10d_sse42_f64
#define xexp10f nsimd_sleef_exp10_u10d_sse42_f32
#define xexp10_u35 nsimd_sleef_exp10_u35d_sse42_f64
#define xexp10f_u35 nsimd_sleef_exp10_u35d_sse42_f32
#define xexpm1 nsimd_sleef_expm1_u10d_sse42_f64
#define xexpm1f nsimd_sleef_expm1_u10d_sse42_f32
#define xlog10 nsimd_sleef_log10_u10d_sse42_f64
#define xlog10f nsimd_sleef_log10_u10d_sse42_f32
#define xlog2 nsimd_sleef_log2_u10d_sse42_f64
#define xlog2f nsimd_sleef_log2_u10d_sse42_f32
#define xlog2_u35 nsimd_sleef_log2_u35d_sse42_f64
#define xlog2f_u35 nsimd_sleef_log2_u35d_sse42_f32
#define xlog1p nsimd_sleef_log1p_u10d_sse42_f64
#define xlog1pf nsimd_sleef_log1p_u10d_sse42_f32
#define xsincospi_u05 nsimd_sleef_sincospi_u05d_sse42_f64
#define xsincospif_u05 nsimd_sleef_sincospi_u05d_sse42_f32
#define xsincospi_u35 nsimd_sleef_sincospi_u35d_sse42_f64
#define xsincospif_u35 nsimd_sleef_sincospi_u35d_sse42_f32
#define xsinpi_u05 nsimd_sleef_sinpi_u05d_sse42_f64
#define xsinpif_u05 nsimd_sleef_sinpi_u05d_sse42_f32
#define xcospi_u05 nsimd_sleef_cospi_u05d_sse42_f64
#define xcospif_u05 nsimd_sleef_cospi_u05d_sse42_f32
/* Single-variant functions: no accuracy tag, identical in both branches. */
#define xldexp nsimd_sleef_ldexp_sse42_f64
#define xldexpf nsimd_sleef_ldexp_sse42_f32
#define xilogb nsimd_sleef_ilogb_sse42_f64
#define xilogbf nsimd_sleef_ilogb_sse42_f32
#define xfma nsimd_sleef_fma_sse42_f64
#define xfmaf nsimd_sleef_fma_sse42_f32
#define xsqrt nsimd_sleef_sqrt_sse42_f64
#define xsqrtf nsimd_sleef_sqrt_sse42_f32
#define xsqrt_u05 nsimd_sleef_sqrt_u05d_sse42_f64
#define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sse42_f32
#define xsqrt_u35 nsimd_sleef_sqrt_u35d_sse42_f64
#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sse42_f32
#define xhypot_u05 nsimd_sleef_hypot_u05d_sse42_f64
#define xhypotf_u05 nsimd_sleef_hypot_u05d_sse42_f32
#define xhypot_u35 nsimd_sleef_hypot_u35d_sse42_f64
#define xhypotf_u35 nsimd_sleef_hypot_u35d_sse42_f32
#define xfabs nsimd_sleef_fabs_sse42_f64
#define xfabsf nsimd_sleef_fabs_sse42_f32
#define xcopysign nsimd_sleef_copysign_sse42_f64
#define xcopysignf nsimd_sleef_copysign_sse42_f32
#define xfmax nsimd_sleef_fmax_sse42_f64
#define xfmaxf nsimd_sleef_fmax_sse42_f32
#define xfmin nsimd_sleef_fmin_sse42_f64
#define xfminf nsimd_sleef_fmin_sse42_f32
#define xfdim nsimd_sleef_fdim_sse42_f64
#define xfdimf nsimd_sleef_fdim_sse42_f32
#define xtrunc nsimd_sleef_trunc_sse42_f64
#define xtruncf nsimd_sleef_trunc_sse42_f32
#define xfloor nsimd_sleef_floor_sse42_f64
#define xfloorf nsimd_sleef_floor_sse42_f32
#define xceil nsimd_sleef_ceil_sse42_f64
#define xceilf nsimd_sleef_ceil_sse42_f32
#define xround nsimd_sleef_round_sse42_f64
#define xroundf nsimd_sleef_round_sse42_f32
#define xrint nsimd_sleef_rint_sse42_f64
#define xrintf nsimd_sleef_rint_sse42_f32
#define xnextafter nsimd_sleef_nextafter_sse42_f64
#define xnextafterf nsimd_sleef_nextafter_sse42_f32
#define xfrfrexp nsimd_sleef_frfrexp_sse42_f64
#define xfrfrexpf nsimd_sleef_frfrexp_sse42_f32
#define xexpfrexp nsimd_sleef_expfrexp_sse42_f64
#define xexpfrexpf nsimd_sleef_expfrexp_sse42_f32
#define xfmod nsimd_sleef_fmod_sse42_f64
#define xfmodf nsimd_sleef_fmod_sse42_f32
#define xremainder nsimd_sleef_remainder_sse42_f64
#define xremainderf nsimd_sleef_remainder_sse42_f32
#define xmodf nsimd_sleef_modf_sse42_f64
#define xmodff nsimd_sleef_modf_sse42_f32
#define xlgamma_u1 nsimd_sleef_lgamma_u10d_sse42_f64
#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sse42_f32
#define xtgamma_u1 nsimd_sleef_tgamma_u10d_sse42_f64
#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sse42_f32
#define xerf_u1 nsimd_sleef_erf_u10d_sse42_f64
#define xerff_u1 nsimd_sleef_erf_u10d_sse42_f32
#define xerfc_u15 nsimd_sleef_erfc_u15d_sse42_f64
#define xerfcf_u15 nsimd_sleef_erfc_u15d_sse42_f32
#define xgetInt nsimd_sleef_getInt_sse42_f64
#define xgetIntf nsimd_sleef_getInt_sse42_f32
#define xgetPtr nsimd_sleef_getPtr_sse42_f64
#define xgetPtrf nsimd_sleef_getPtr_sse42_f32
#else
/* Non-deterministic branch: same mapping without the 'd' suffix. */
#define xsin nsimd_sleef_sin_u35_sse42_f64
#define xsinf nsimd_sleef_sin_u35_sse42_f32
#define xcos nsimd_sleef_cos_u35_sse42_f64
#define xcosf nsimd_sleef_cos_u35_sse42_f32
#define xsincos nsimd_sleef_sincos_u35_sse42_f64
#define xsincosf nsimd_sleef_sincos_u35_sse42_f32
#define xtan nsimd_sleef_tan_u35_sse42_f64
#define xtanf nsimd_sleef_tan_u35_sse42_f32
#define xasin nsimd_sleef_asin_u35_sse42_f64
#define xasinf nsimd_sleef_asin_u35_sse42_f32
#define xacos nsimd_sleef_acos_u35_sse42_f64
#define xacosf nsimd_sleef_acos_u35_sse42_f32
#define xatan nsimd_sleef_atan_u35_sse42_f64
#define xatanf nsimd_sleef_atan_u35_sse42_f32
#define xatan2 nsimd_sleef_atan2_u35_sse42_f64
#define xatan2f nsimd_sleef_atan2_u35_sse42_f32
#define xlog nsimd_sleef_log_u35_sse42_f64
#define xlogf nsimd_sleef_log_u35_sse42_f32
#define xcbrt nsimd_sleef_cbrt_u35_sse42_f64
#define xcbrtf nsimd_sleef_cbrt_u35_sse42_f32
#define xsin_u1 nsimd_sleef_sin_u10_sse42_f64
#define xsinf_u1 nsimd_sleef_sin_u10_sse42_f32
#define xcos_u1 nsimd_sleef_cos_u10_sse42_f64
#define xcosf_u1 nsimd_sleef_cos_u10_sse42_f32
#define xsincos_u1 nsimd_sleef_sincos_u10_sse42_f64
#define xsincosf_u1 nsimd_sleef_sincos_u10_sse42_f32
#define xtan_u1 nsimd_sleef_tan_u10_sse42_f64
#define xtanf_u1 nsimd_sleef_tan_u10_sse42_f32
#define xasin_u1 nsimd_sleef_asin_u10_sse42_f64
#define xasinf_u1 nsimd_sleef_asin_u10_sse42_f32
#define xacos_u1 nsimd_sleef_acos_u10_sse42_f64
#define xacosf_u1 nsimd_sleef_acos_u10_sse42_f32
#define xatan_u1 nsimd_sleef_atan_u10_sse42_f64
#define xatanf_u1 nsimd_sleef_atan_u10_sse42_f32
#define xatan2_u1 nsimd_sleef_atan2_u10_sse42_f64
#define xatan2f_u1 nsimd_sleef_atan2_u10_sse42_f32
#define xlog_u1 nsimd_sleef_log_u10_sse42_f64
#define xlogf_u1 nsimd_sleef_log_u10_sse42_f32
#define xcbrt_u1 nsimd_sleef_cbrt_u10_sse42_f64
#define xcbrtf_u1 nsimd_sleef_cbrt_u10_sse42_f32
#define xexp nsimd_sleef_exp_u10_sse42_f64
#define xexpf nsimd_sleef_exp_u10_sse42_f32
#define xpow nsimd_sleef_pow_u10_sse42_f64
#define xpowf nsimd_sleef_pow_u10_sse42_f32
#define xsinh nsimd_sleef_sinh_u10_sse42_f64
#define xsinhf nsimd_sleef_sinh_u10_sse42_f32
#define xcosh nsimd_sleef_cosh_u10_sse42_f64
#define xcoshf nsimd_sleef_cosh_u10_sse42_f32
#define xtanh nsimd_sleef_tanh_u10_sse42_f64
#define xtanhf nsimd_sleef_tanh_u10_sse42_f32
#define xsinh_u35 nsimd_sleef_sinh_u35_sse42_f64
#define xsinhf_u35 nsimd_sleef_sinh_u35_sse42_f32
#define xcosh_u35 nsimd_sleef_cosh_u35_sse42_f64
#define xcoshf_u35 nsimd_sleef_cosh_u35_sse42_f32
#define xtanh_u35 nsimd_sleef_tanh_u35_sse42_f64
#define xtanhf_u35 nsimd_sleef_tanh_u35_sse42_f32
#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sse42_f64
#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sse42_f32
#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sse42_f64
#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sse42_f32
#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sse42_f64
#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sse42_f32
#define xasinh nsimd_sleef_asinh_u10_sse42_f64
#define xasinhf nsimd_sleef_asinh_u10_sse42_f32
#define xacosh nsimd_sleef_acosh_u10_sse42_f64
#define xacoshf nsimd_sleef_acosh_u10_sse42_f32
#define xatanh nsimd_sleef_atanh_u10_sse42_f64
#define xatanhf nsimd_sleef_atanh_u10_sse42_f32
#define xexp2 nsimd_sleef_exp2_u10_sse42_f64
#define xexp2f nsimd_sleef_exp2_u10_sse42_f32
#define xexp2_u35 nsimd_sleef_exp2_u35_sse42_f64
#define xexp2f_u35 nsimd_sleef_exp2_u35_sse42_f32
#define xexp10 nsimd_sleef_exp10_u10_sse42_f64
#define xexp10f nsimd_sleef_exp10_u10_sse42_f32
#define xexp10_u35 nsimd_sleef_exp10_u35_sse42_f64
#define xexp10f_u35 nsimd_sleef_exp10_u35_sse42_f32
#define xexpm1 nsimd_sleef_expm1_u10_sse42_f64
#define xexpm1f nsimd_sleef_expm1_u10_sse42_f32
#define xlog10 nsimd_sleef_log10_u10_sse42_f64
#define xlog10f nsimd_sleef_log10_u10_sse42_f32
#define xlog2 nsimd_sleef_log2_u10_sse42_f64
#define xlog2f nsimd_sleef_log2_u10_sse42_f32
#define xlog2_u35 nsimd_sleef_log2_u35_sse42_f64
#define xlog2f_u35 nsimd_sleef_log2_u35_sse42_f32
#define xlog1p nsimd_sleef_log1p_u10_sse42_f64
#define xlog1pf nsimd_sleef_log1p_u10_sse42_f32
#define xsincospi_u05 nsimd_sleef_sincospi_u05_sse42_f64
#define xsincospif_u05 nsimd_sleef_sincospi_u05_sse42_f32
#define xsincospi_u35 nsimd_sleef_sincospi_u35_sse42_f64
#define xsincospif_u35 nsimd_sleef_sincospi_u35_sse42_f32
#define xsinpi_u05 nsimd_sleef_sinpi_u05_sse42_f64
#define xsinpif_u05 nsimd_sleef_sinpi_u05_sse42_f32
#define xcospi_u05 nsimd_sleef_cospi_u05_sse42_f64
#define xcospif_u05 nsimd_sleef_cospi_u05_sse42_f32
#define xldexp nsimd_sleef_ldexp_sse42_f64
#define xldexpf nsimd_sleef_ldexp_sse42_f32
#define xilogb nsimd_sleef_ilogb_sse42_f64
#define xilogbf nsimd_sleef_ilogb_sse42_f32
#define xfma nsimd_sleef_fma_sse42_f64
#define xfmaf nsimd_sleef_fma_sse42_f32
#define xsqrt nsimd_sleef_sqrt_sse42_f64
#define xsqrtf nsimd_sleef_sqrt_sse42_f32
#define xsqrt_u05 nsimd_sleef_sqrt_u05_sse42_f64
#define xsqrtf_u05 nsimd_sleef_sqrt_u05_sse42_f32
#define xsqrt_u35 nsimd_sleef_sqrt_u35_sse42_f64
#define xsqrtf_u35 nsimd_sleef_sqrt_u35_sse42_f32
#define xhypot_u05 nsimd_sleef_hypot_u05_sse42_f64
#define xhypotf_u05 nsimd_sleef_hypot_u05_sse42_f32
#define xhypot_u35 nsimd_sleef_hypot_u35_sse42_f64
#define xhypotf_u35 nsimd_sleef_hypot_u35_sse42_f32
#define xfabs nsimd_sleef_fabs_sse42_f64
#define xfabsf nsimd_sleef_fabs_sse42_f32
#define xcopysign nsimd_sleef_copysign_sse42_f64
#define xcopysignf nsimd_sleef_copysign_sse42_f32
#define xfmax nsimd_sleef_fmax_sse42_f64
#define xfmaxf nsimd_sleef_fmax_sse42_f32
#define xfmin nsimd_sleef_fmin_sse42_f64
#define xfminf nsimd_sleef_fmin_sse42_f32
#define xfdim nsimd_sleef_fdim_sse42_f64
#define xfdimf nsimd_sleef_fdim_sse42_f32
#define xtrunc nsimd_sleef_trunc_sse42_f64
#define xtruncf nsimd_sleef_trunc_sse42_f32
#define xfloor nsimd_sleef_floor_sse42_f64
#define xfloorf nsimd_sleef_floor_sse42_f32
#define xceil nsimd_sleef_ceil_sse42_f64
#define xceilf nsimd_sleef_ceil_sse42_f32
#define xround nsimd_sleef_round_sse42_f64
#define xroundf nsimd_sleef_round_sse42_f32
#define xrint nsimd_sleef_rint_sse42_f64
#define xrintf nsimd_sleef_rint_sse42_f32
#define xnextafter nsimd_sleef_nextafter_sse42_f64
#define xnextafterf nsimd_sleef_nextafter_sse42_f32
#define xfrfrexp nsimd_sleef_frfrexp_sse42_f64
#define xfrfrexpf nsimd_sleef_frfrexp_sse42_f32
#define xexpfrexp nsimd_sleef_expfrexp_sse42_f64
#define xexpfrexpf nsimd_sleef_expfrexp_sse42_f32
#define xfmod nsimd_sleef_fmod_sse42_f64
#define xfmodf nsimd_sleef_fmod_sse42_f32
#define xremainder nsimd_sleef_remainder_sse42_f64
#define xremainderf nsimd_sleef_remainder_sse42_f32
#define xmodf nsimd_sleef_modf_sse42_f64
#define xmodff nsimd_sleef_modf_sse42_f32
#define xlgamma_u1 nsimd_sleef_lgamma_u10_sse42_f64
#define xlgammaf_u1 nsimd_sleef_lgamma_u10_sse42_f32
#define xtgamma_u1 nsimd_sleef_tgamma_u10_sse42_f64
#define xtgammaf_u1 nsimd_sleef_tgamma_u10_sse42_f32
#define xerf_u1 nsimd_sleef_erf_u10_sse42_f64
#define xerff_u1 nsimd_sleef_erf_u10_sse42_f32
#define xerfc_u15 nsimd_sleef_erfc_u15_sse42_f64
#define xerfcf_u15 nsimd_sleef_erfc_u15_sse42_f32
#define xgetInt nsimd_sleef_getInt_sse42_f64
#define xgetIntf nsimd_sleef_getInt_sse42_f32
#define xgetPtr nsimd_sleef_getPtr_sse42_f64
#define xgetPtrf nsimd_sleef_getPtr_sse42_f32
#endif

/* Internal SLEEF helpers (Payne-Hanek reduction tables and gamma kernels
   per their names — NOTE(review): semantics inferred from SLEEF naming,
   confirm against SLEEF sources): shared by both branches. */
#define rempi nsimd_sleef_rempi_sse42
#define rempif nsimd_sleef_rempif_sse42
#define rempisub nsimd_sleef_rempisub_sse42
#define rempisubf nsimd_sleef_rempisubf_sse42
#define gammak nsimd_gammak_sse42
#define gammafk nsimd_gammafk_sse42

#endif

#endif
================================================ FILE: src/renamesve.h ================================================ #ifndef RENAMESVE_H
#define RENAMESVE_H

/* ------------------------------------------------------------------------- */
/* Naming of functions sve128 */

#ifdef NSIMD_SVE128
#ifdef DETERMINISTIC
#define xsin nsimd_sleef_sin_u35d_sve128_f64
#define xsinf nsimd_sleef_sin_u35d_sve128_f32
#define xcos nsimd_sleef_cos_u35d_sve128_f64
#define xcosf nsimd_sleef_cos_u35d_sve128_f32
#define xsincos nsimd_sleef_sincos_u35d_sve128_f64
#define xsincosf nsimd_sleef_sincos_u35d_sve128_f32
#define xtan nsimd_sleef_tan_u35d_sve128_f64
#define xtanf
nsimd_sleef_tan_u35d_sve128_f32 #define xasin nsimd_sleef_asin_u35d_sve128_f64 #define xasinf nsimd_sleef_asin_u35d_sve128_f32 #define xacos nsimd_sleef_acos_u35d_sve128_f64 #define xacosf nsimd_sleef_acos_u35d_sve128_f32 #define xatan nsimd_sleef_atan_u35d_sve128_f64 #define xatanf nsimd_sleef_atan_u35d_sve128_f32 #define xatan2 nsimd_sleef_atan2_u35d_sve128_f64 #define xatan2f nsimd_sleef_atan2_u35d_sve128_f32 #define xlog nsimd_sleef_log_u35d_sve128_f64 #define xlogf nsimd_sleef_log_u35d_sve128_f32 #define xcbrt nsimd_sleef_cbrt_u35d_sve128_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_sve128_f32 #define xsin_u1 nsimd_sleef_sin_u10d_sve128_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_sve128_f32 #define xcos_u1 nsimd_sleef_cos_u10d_sve128_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_sve128_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_sve128_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_sve128_f32 #define xtan_u1 nsimd_sleef_tan_u10d_sve128_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_sve128_f32 #define xasin_u1 nsimd_sleef_asin_u10d_sve128_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_sve128_f32 #define xacos_u1 nsimd_sleef_acos_u10d_sve128_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_sve128_f32 #define xatan_u1 nsimd_sleef_atan_u10d_sve128_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_sve128_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_sve128_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_sve128_f32 #define xlog_u1 nsimd_sleef_log_u10d_sve128_f64 #define xlogf_u1 nsimd_sleef_log_u10d_sve128_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_sve128_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sve128_f32 #define xexp nsimd_sleef_exp_u10d_sve128_f64 #define xexpf nsimd_sleef_exp_u10d_sve128_f32 #define xpow nsimd_sleef_pow_u10d_sve128_f64 #define xpowf nsimd_sleef_pow_u10d_sve128_f32 #define xsinh nsimd_sleef_sinh_u10d_sve128_f64 #define xsinhf nsimd_sleef_sinh_u10d_sve128_f32 #define xcosh nsimd_sleef_cosh_u10d_sve128_f64 #define xcoshf nsimd_sleef_cosh_u10d_sve128_f32 #define xtanh 
nsimd_sleef_tanh_u10d_sve128_f64 #define xtanhf nsimd_sleef_tanh_u10d_sve128_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_sve128_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_sve128_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_sve128_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_sve128_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_sve128_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_sve128_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sve128_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sve128_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sve128_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sve128_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sve128_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sve128_f32 #define xasinh nsimd_sleef_asinh_u10d_sve128_f64 #define xasinhf nsimd_sleef_asinh_u10d_sve128_f32 #define xacosh nsimd_sleef_acosh_u10d_sve128_f64 #define xacoshf nsimd_sleef_acosh_u10d_sve128_f32 #define xatanh nsimd_sleef_atanh_u10d_sve128_f64 #define xatanhf nsimd_sleef_atanh_u10d_sve128_f32 #define xexp2 nsimd_sleef_exp2_u10d_sve128_f64 #define xexp2f nsimd_sleef_exp2_u10d_sve128_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_sve128_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_sve128_f32 #define xexp10 nsimd_sleef_exp10_u10d_sve128_f64 #define xexp10f nsimd_sleef_exp10_u10d_sve128_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_sve128_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_sve128_f32 #define xexpm1 nsimd_sleef_expm1_u10d_sve128_f64 #define xexpm1f nsimd_sleef_expm1_u10d_sve128_f32 #define xlog10 nsimd_sleef_log10_u10d_sve128_f64 #define xlog10f nsimd_sleef_log10_u10d_sve128_f32 #define xlog2 nsimd_sleef_log2_u10d_sve128_f64 #define xlog2f nsimd_sleef_log2_u10d_sve128_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_sve128_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_sve128_f32 #define xlog1p nsimd_sleef_log1p_u10d_sve128_f64 #define xlog1pf nsimd_sleef_log1p_u10d_sve128_f32 #define xsincospi_u05 
nsimd_sleef_sincospi_u05d_sve128_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_sve128_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_sve128_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_sve128_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_sve128_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_sve128_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_sve128_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_sve128_f32 #define xldexp nsimd_sleef_ldexp_sve128_f64 #define xldexpf nsimd_sleef_ldexp_sve128_f32 #define xilogb nsimd_sleef_ilogb_sve128_f64 #define xilogbf nsimd_sleef_ilogb_sve128_f32 #define xfma nsimd_sleef_fma_sve128_f64 #define xfmaf nsimd_sleef_fma_sve128_f32 #define xsqrt nsimd_sleef_sqrt_sve128_f64 #define xsqrtf nsimd_sleef_sqrt_sve128_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_sve128_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sve128_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_sve128_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sve128_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_sve128_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_sve128_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_sve128_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_sve128_f32 #define xfabs nsimd_sleef_fabs_sve128_f64 #define xfabsf nsimd_sleef_fabs_sve128_f32 #define xcopysign nsimd_sleef_copysign_sve128_f64 #define xcopysignf nsimd_sleef_copysign_sve128_f32 #define xfmax nsimd_sleef_fmax_sve128_f64 #define xfmaxf nsimd_sleef_fmax_sve128_f32 #define xfmin nsimd_sleef_fmin_sve128_f64 #define xfminf nsimd_sleef_fmin_sve128_f32 #define xfdim nsimd_sleef_fdim_sve128_f64 #define xfdimf nsimd_sleef_fdim_sve128_f32 #define xtrunc nsimd_sleef_trunc_sve128_f64 #define xtruncf nsimd_sleef_trunc_sve128_f32 #define xfloor nsimd_sleef_floor_sve128_f64 #define xfloorf nsimd_sleef_floor_sve128_f32 #define xceil nsimd_sleef_ceil_sve128_f64 #define xceilf nsimd_sleef_ceil_sve128_f32 #define xround nsimd_sleef_round_sve128_f64 #define xroundf nsimd_sleef_round_sve128_f32 #define 
xrint nsimd_sleef_rint_sve128_f64 #define xrintf nsimd_sleef_rint_sve128_f32 #define xnextafter nsimd_sleef_nextafter_sve128_f64 #define xnextafterf nsimd_sleef_nextafter_sve128_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve128_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve128_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve128_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve128_f32 #define xfmod nsimd_sleef_fmod_sve128_f64 #define xfmodf nsimd_sleef_fmod_sve128_f32 #define xremainder nsimd_sleef_remainder_sve128_f64 #define xremainderf nsimd_sleef_remainder_sve128_f32 #define xmodf nsimd_sleef_modf_sve128_f64 #define xmodff nsimd_sleef_modf_sve128_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_sve128_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sve128_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_sve128_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sve128_f32 #define xerf_u1 nsimd_sleef_erf_u10d_sve128_f64 #define xerff_u1 nsimd_sleef_erf_u10d_sve128_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_sve128_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_sve128_f32 #define xgetInt nsimd_sleef_getInt_sve128_f64 #define xgetIntf nsimd_sleef_getInt_sve128_f32 #define xgetPtr nsimd_sleef_getPtr_sve128_f64 #define xgetPtrf nsimd_sleef_getPtr_sve128_f32 #else #define xsin nsimd_sleef_sin_u35_sve128_f64 #define xsinf nsimd_sleef_sin_u35_sve128_f32 #define xcos nsimd_sleef_cos_u35_sve128_f64 #define xcosf nsimd_sleef_cos_u35_sve128_f32 #define xsincos nsimd_sleef_sincos_u35_sve128_f64 #define xsincosf nsimd_sleef_sincos_u35_sve128_f32 #define xtan nsimd_sleef_tan_u35_sve128_f64 #define xtanf nsimd_sleef_tan_u35_sve128_f32 #define xasin nsimd_sleef_asin_u35_sve128_f64 #define xasinf nsimd_sleef_asin_u35_sve128_f32 #define xacos nsimd_sleef_acos_u35_sve128_f64 #define xacosf nsimd_sleef_acos_u35_sve128_f32 #define xatan nsimd_sleef_atan_u35_sve128_f64 #define xatanf nsimd_sleef_atan_u35_sve128_f32 #define xatan2 nsimd_sleef_atan2_u35_sve128_f64 #define xatan2f 
nsimd_sleef_atan2_u35_sve128_f32 #define xlog nsimd_sleef_log_u35_sve128_f64 #define xlogf nsimd_sleef_log_u35_sve128_f32 #define xcbrt nsimd_sleef_cbrt_u35_sve128_f64 #define xcbrtf nsimd_sleef_cbrt_u35_sve128_f32 #define xsin_u1 nsimd_sleef_sin_u10_sve128_f64 #define xsinf_u1 nsimd_sleef_sin_u10_sve128_f32 #define xcos_u1 nsimd_sleef_cos_u10_sve128_f64 #define xcosf_u1 nsimd_sleef_cos_u10_sve128_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_sve128_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_sve128_f32 #define xtan_u1 nsimd_sleef_tan_u10_sve128_f64 #define xtanf_u1 nsimd_sleef_tan_u10_sve128_f32 #define xasin_u1 nsimd_sleef_asin_u10_sve128_f64 #define xasinf_u1 nsimd_sleef_asin_u10_sve128_f32 #define xacos_u1 nsimd_sleef_acos_u10_sve128_f64 #define xacosf_u1 nsimd_sleef_acos_u10_sve128_f32 #define xatan_u1 nsimd_sleef_atan_u10_sve128_f64 #define xatanf_u1 nsimd_sleef_atan_u10_sve128_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_sve128_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_sve128_f32 #define xlog_u1 nsimd_sleef_log_u10_sve128_f64 #define xlogf_u1 nsimd_sleef_log_u10_sve128_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_sve128_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_sve128_f32 #define xexp nsimd_sleef_exp_u10_sve128_f64 #define xexpf nsimd_sleef_exp_u10_sve128_f32 #define xpow nsimd_sleef_pow_u10_sve128_f64 #define xpowf nsimd_sleef_pow_u10_sve128_f32 #define xsinh nsimd_sleef_sinh_u10_sve128_f64 #define xsinhf nsimd_sleef_sinh_u10_sve128_f32 #define xcosh nsimd_sleef_cosh_u10_sve128_f64 #define xcoshf nsimd_sleef_cosh_u10_sve128_f32 #define xtanh nsimd_sleef_tanh_u10_sve128_f64 #define xtanhf nsimd_sleef_tanh_u10_sve128_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_sve128_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_sve128_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_sve128_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_sve128_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_sve128_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_sve128_f32 #define xfastsin_u3500 
nsimd_sleef_fastsin_u3500_sve128_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sve128_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sve128_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sve128_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sve128_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sve128_f32 #define xasinh nsimd_sleef_asinh_u10_sve128_f64 #define xasinhf nsimd_sleef_asinh_u10_sve128_f32 #define xacosh nsimd_sleef_acosh_u10_sve128_f64 #define xacoshf nsimd_sleef_acosh_u10_sve128_f32 #define xatanh nsimd_sleef_atanh_u10_sve128_f64 #define xatanhf nsimd_sleef_atanh_u10_sve128_f32 #define xexp2 nsimd_sleef_exp2_u10_sve128_f64 #define xexp2f nsimd_sleef_exp2_u10_sve128_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_sve128_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_sve128_f32 #define xexp10 nsimd_sleef_exp10_u10_sve128_f64 #define xexp10f nsimd_sleef_exp10_u10_sve128_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_sve128_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_sve128_f32 #define xexpm1 nsimd_sleef_expm1_u10_sve128_f64 #define xexpm1f nsimd_sleef_expm1_u10_sve128_f32 #define xlog10 nsimd_sleef_log10_u10_sve128_f64 #define xlog10f nsimd_sleef_log10_u10_sve128_f32 #define xlog2 nsimd_sleef_log2_u10_sve128_f64 #define xlog2f nsimd_sleef_log2_u10_sve128_f32 #define xlog2_u35 nsimd_sleef_log2_u35_sve128_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_sve128_f32 #define xlog1p nsimd_sleef_log1p_u10_sve128_f64 #define xlog1pf nsimd_sleef_log1p_u10_sve128_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_sve128_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_sve128_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_sve128_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_sve128_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_sve128_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_sve128_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_sve128_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_sve128_f32 #define xldexp nsimd_sleef_ldexp_sve128_f64 
#define xldexpf nsimd_sleef_ldexp_sve128_f32 #define xilogb nsimd_sleef_ilogb_sve128_f64 #define xilogbf nsimd_sleef_ilogb_sve128_f32 #define xfma nsimd_sleef_fma_sve128_f64 #define xfmaf nsimd_sleef_fma_sve128_f32 #define xsqrt nsimd_sleef_sqrt_sve128_f64 #define xsqrtf nsimd_sleef_sqrt_sve128_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_sve128_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_sve128_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_sve128_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_sve128_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_sve128_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_sve128_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_sve128_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_sve128_f32 #define xfabs nsimd_sleef_fabs_sve128_f64 #define xfabsf nsimd_sleef_fabs_sve128_f32 #define xcopysign nsimd_sleef_copysign_sve128_f64 #define xcopysignf nsimd_sleef_copysign_sve128_f32 #define xfmax nsimd_sleef_fmax_sve128_f64 #define xfmaxf nsimd_sleef_fmax_sve128_f32 #define xfmin nsimd_sleef_fmin_sve128_f64 #define xfminf nsimd_sleef_fmin_sve128_f32 #define xfdim nsimd_sleef_fdim_sve128_f64 #define xfdimf nsimd_sleef_fdim_sve128_f32 #define xtrunc nsimd_sleef_trunc_sve128_f64 #define xtruncf nsimd_sleef_trunc_sve128_f32 #define xfloor nsimd_sleef_floor_sve128_f64 #define xfloorf nsimd_sleef_floor_sve128_f32 #define xceil nsimd_sleef_ceil_sve128_f64 #define xceilf nsimd_sleef_ceil_sve128_f32 #define xround nsimd_sleef_round_sve128_f64 #define xroundf nsimd_sleef_round_sve128_f32 #define xrint nsimd_sleef_rint_sve128_f64 #define xrintf nsimd_sleef_rint_sve128_f32 #define xnextafter nsimd_sleef_nextafter_sve128_f64 #define xnextafterf nsimd_sleef_nextafter_sve128_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve128_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve128_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve128_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve128_f32 #define xfmod nsimd_sleef_fmod_sve128_f64 #define xfmodf nsimd_sleef_fmod_sve128_f32 #define xremainder 
nsimd_sleef_remainder_sve128_f64 #define xremainderf nsimd_sleef_remainder_sve128_f32 #define xmodf nsimd_sleef_modf_sve128_f64 #define xmodff nsimd_sleef_modf_sve128_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_sve128_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_sve128_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_sve128_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_sve128_f32 #define xerf_u1 nsimd_sleef_erf_u10_sve128_f64 #define xerff_u1 nsimd_sleef_erf_u10_sve128_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_sve128_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_sve128_f32 #define xgetInt nsimd_sleef_getInt_sve128_f64 #define xgetIntf nsimd_sleef_getInt_sve128_f32 #define xgetPtr nsimd_sleef_getPtr_sve128_f64 #define xgetPtrf nsimd_sleef_getPtr_sve128_f32 #endif #define rempi nsimd_sleef_rempi_sve128 #define rempif nsimd_sleef_rempif_sve128 #define rempisub nsimd_sleef_rempisub_sve128 #define rempisubf nsimd_sleef_rempisubf_sve128 #define gammak nsimd_gammak_sve128 #define gammafk nsimd_gammafk_sve128 #endif /* ------------------------------------------------------------------------- */ /* Naming of functions sve256 */ #ifdef NSIMD_SVE256 #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_sve256_f64 #define xsinf nsimd_sleef_sin_u35d_sve256_f32 #define xcos nsimd_sleef_cos_u35d_sve256_f64 #define xcosf nsimd_sleef_cos_u35d_sve256_f32 #define xsincos nsimd_sleef_sincos_u35d_sve256_f64 #define xsincosf nsimd_sleef_sincos_u35d_sve256_f32 #define xtan nsimd_sleef_tan_u35d_sve256_f64 #define xtanf nsimd_sleef_tan_u35d_sve256_f32 #define xasin nsimd_sleef_asin_u35d_sve256_f64 #define xasinf nsimd_sleef_asin_u35d_sve256_f32 #define xacos nsimd_sleef_acos_u35d_sve256_f64 #define xacosf nsimd_sleef_acos_u35d_sve256_f32 #define xatan nsimd_sleef_atan_u35d_sve256_f64 #define xatanf nsimd_sleef_atan_u35d_sve256_f32 #define xatan2 nsimd_sleef_atan2_u35d_sve256_f64 #define xatan2f nsimd_sleef_atan2_u35d_sve256_f32 #define xlog nsimd_sleef_log_u35d_sve256_f64 #define xlogf 
nsimd_sleef_log_u35d_sve256_f32 #define xcbrt nsimd_sleef_cbrt_u35d_sve256_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_sve256_f32 #define xsin_u1 nsimd_sleef_sin_u10d_sve256_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_sve256_f32 #define xcos_u1 nsimd_sleef_cos_u10d_sve256_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_sve256_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_sve256_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_sve256_f32 #define xtan_u1 nsimd_sleef_tan_u10d_sve256_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_sve256_f32 #define xasin_u1 nsimd_sleef_asin_u10d_sve256_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_sve256_f32 #define xacos_u1 nsimd_sleef_acos_u10d_sve256_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_sve256_f32 #define xatan_u1 nsimd_sleef_atan_u10d_sve256_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_sve256_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_sve256_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_sve256_f32 #define xlog_u1 nsimd_sleef_log_u10d_sve256_f64 #define xlogf_u1 nsimd_sleef_log_u10d_sve256_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_sve256_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sve256_f32 #define xexp nsimd_sleef_exp_u10d_sve256_f64 #define xexpf nsimd_sleef_exp_u10d_sve256_f32 #define xpow nsimd_sleef_pow_u10d_sve256_f64 #define xpowf nsimd_sleef_pow_u10d_sve256_f32 #define xsinh nsimd_sleef_sinh_u10d_sve256_f64 #define xsinhf nsimd_sleef_sinh_u10d_sve256_f32 #define xcosh nsimd_sleef_cosh_u10d_sve256_f64 #define xcoshf nsimd_sleef_cosh_u10d_sve256_f32 #define xtanh nsimd_sleef_tanh_u10d_sve256_f64 #define xtanhf nsimd_sleef_tanh_u10d_sve256_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_sve256_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_sve256_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_sve256_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_sve256_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_sve256_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_sve256_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sve256_f64 #define xfastsinf_u3500 
nsimd_sleef_fastsin_u3500d_sve256_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sve256_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sve256_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sve256_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sve256_f32 #define xasinh nsimd_sleef_asinh_u10d_sve256_f64 #define xasinhf nsimd_sleef_asinh_u10d_sve256_f32 #define xacosh nsimd_sleef_acosh_u10d_sve256_f64 #define xacoshf nsimd_sleef_acosh_u10d_sve256_f32 #define xatanh nsimd_sleef_atanh_u10d_sve256_f64 #define xatanhf nsimd_sleef_atanh_u10d_sve256_f32 #define xexp2 nsimd_sleef_exp2_u10d_sve256_f64 #define xexp2f nsimd_sleef_exp2_u10d_sve256_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_sve256_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_sve256_f32 #define xexp10 nsimd_sleef_exp10_u10d_sve256_f64 #define xexp10f nsimd_sleef_exp10_u10d_sve256_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_sve256_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_sve256_f32 #define xexpm1 nsimd_sleef_expm1_u10d_sve256_f64 #define xexpm1f nsimd_sleef_expm1_u10d_sve256_f32 #define xlog10 nsimd_sleef_log10_u10d_sve256_f64 #define xlog10f nsimd_sleef_log10_u10d_sve256_f32 #define xlog2 nsimd_sleef_log2_u10d_sve256_f64 #define xlog2f nsimd_sleef_log2_u10d_sve256_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_sve256_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_sve256_f32 #define xlog1p nsimd_sleef_log1p_u10d_sve256_f64 #define xlog1pf nsimd_sleef_log1p_u10d_sve256_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_sve256_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_sve256_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_sve256_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_sve256_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_sve256_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_sve256_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_sve256_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_sve256_f32 #define xldexp nsimd_sleef_ldexp_sve256_f64 #define xldexpf 
nsimd_sleef_ldexp_sve256_f32 #define xilogb nsimd_sleef_ilogb_sve256_f64 #define xilogbf nsimd_sleef_ilogb_sve256_f32 #define xfma nsimd_sleef_fma_sve256_f64 #define xfmaf nsimd_sleef_fma_sve256_f32 #define xsqrt nsimd_sleef_sqrt_sve256_f64 #define xsqrtf nsimd_sleef_sqrt_sve256_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_sve256_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sve256_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_sve256_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sve256_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_sve256_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_sve256_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_sve256_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_sve256_f32 #define xfabs nsimd_sleef_fabs_sve256_f64 #define xfabsf nsimd_sleef_fabs_sve256_f32 #define xcopysign nsimd_sleef_copysign_sve256_f64 #define xcopysignf nsimd_sleef_copysign_sve256_f32 #define xfmax nsimd_sleef_fmax_sve256_f64 #define xfmaxf nsimd_sleef_fmax_sve256_f32 #define xfmin nsimd_sleef_fmin_sve256_f64 #define xfminf nsimd_sleef_fmin_sve256_f32 #define xfdim nsimd_sleef_fdim_sve256_f64 #define xfdimf nsimd_sleef_fdim_sve256_f32 #define xtrunc nsimd_sleef_trunc_sve256_f64 #define xtruncf nsimd_sleef_trunc_sve256_f32 #define xfloor nsimd_sleef_floor_sve256_f64 #define xfloorf nsimd_sleef_floor_sve256_f32 #define xceil nsimd_sleef_ceil_sve256_f64 #define xceilf nsimd_sleef_ceil_sve256_f32 #define xround nsimd_sleef_round_sve256_f64 #define xroundf nsimd_sleef_round_sve256_f32 #define xrint nsimd_sleef_rint_sve256_f64 #define xrintf nsimd_sleef_rint_sve256_f32 #define xnextafter nsimd_sleef_nextafter_sve256_f64 #define xnextafterf nsimd_sleef_nextafter_sve256_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve256_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve256_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve256_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve256_f32 #define xfmod nsimd_sleef_fmod_sve256_f64 #define xfmodf nsimd_sleef_fmod_sve256_f32 #define xremainder 
nsimd_sleef_remainder_sve256_f64 #define xremainderf nsimd_sleef_remainder_sve256_f32 #define xmodf nsimd_sleef_modf_sve256_f64 #define xmodff nsimd_sleef_modf_sve256_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_sve256_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sve256_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_sve256_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sve256_f32 #define xerf_u1 nsimd_sleef_erf_u10d_sve256_f64 #define xerff_u1 nsimd_sleef_erf_u10d_sve256_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_sve256_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_sve256_f32 #define xgetInt nsimd_sleef_getInt_sve256_f64 #define xgetIntf nsimd_sleef_getInt_sve256_f32 #define xgetPtr nsimd_sleef_getPtr_sve256_f64 #define xgetPtrf nsimd_sleef_getPtr_sve256_f32 #else #define xsin nsimd_sleef_sin_u35_sve256_f64 #define xsinf nsimd_sleef_sin_u35_sve256_f32 #define xcos nsimd_sleef_cos_u35_sve256_f64 #define xcosf nsimd_sleef_cos_u35_sve256_f32 #define xsincos nsimd_sleef_sincos_u35_sve256_f64 #define xsincosf nsimd_sleef_sincos_u35_sve256_f32 #define xtan nsimd_sleef_tan_u35_sve256_f64 #define xtanf nsimd_sleef_tan_u35_sve256_f32 #define xasin nsimd_sleef_asin_u35_sve256_f64 #define xasinf nsimd_sleef_asin_u35_sve256_f32 #define xacos nsimd_sleef_acos_u35_sve256_f64 #define xacosf nsimd_sleef_acos_u35_sve256_f32 #define xatan nsimd_sleef_atan_u35_sve256_f64 #define xatanf nsimd_sleef_atan_u35_sve256_f32 #define xatan2 nsimd_sleef_atan2_u35_sve256_f64 #define xatan2f nsimd_sleef_atan2_u35_sve256_f32 #define xlog nsimd_sleef_log_u35_sve256_f64 #define xlogf nsimd_sleef_log_u35_sve256_f32 #define xcbrt nsimd_sleef_cbrt_u35_sve256_f64 #define xcbrtf nsimd_sleef_cbrt_u35_sve256_f32 #define xsin_u1 nsimd_sleef_sin_u10_sve256_f64 #define xsinf_u1 nsimd_sleef_sin_u10_sve256_f32 #define xcos_u1 nsimd_sleef_cos_u10_sve256_f64 #define xcosf_u1 nsimd_sleef_cos_u10_sve256_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_sve256_f64 #define xsincosf_u1 
nsimd_sleef_sincos_u10_sve256_f32 #define xtan_u1 nsimd_sleef_tan_u10_sve256_f64 #define xtanf_u1 nsimd_sleef_tan_u10_sve256_f32 #define xasin_u1 nsimd_sleef_asin_u10_sve256_f64 #define xasinf_u1 nsimd_sleef_asin_u10_sve256_f32 #define xacos_u1 nsimd_sleef_acos_u10_sve256_f64 #define xacosf_u1 nsimd_sleef_acos_u10_sve256_f32 #define xatan_u1 nsimd_sleef_atan_u10_sve256_f64 #define xatanf_u1 nsimd_sleef_atan_u10_sve256_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_sve256_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_sve256_f32 #define xlog_u1 nsimd_sleef_log_u10_sve256_f64 #define xlogf_u1 nsimd_sleef_log_u10_sve256_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_sve256_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_sve256_f32 #define xexp nsimd_sleef_exp_u10_sve256_f64 #define xexpf nsimd_sleef_exp_u10_sve256_f32 #define xpow nsimd_sleef_pow_u10_sve256_f64 #define xpowf nsimd_sleef_pow_u10_sve256_f32 #define xsinh nsimd_sleef_sinh_u10_sve256_f64 #define xsinhf nsimd_sleef_sinh_u10_sve256_f32 #define xcosh nsimd_sleef_cosh_u10_sve256_f64 #define xcoshf nsimd_sleef_cosh_u10_sve256_f32 #define xtanh nsimd_sleef_tanh_u10_sve256_f64 #define xtanhf nsimd_sleef_tanh_u10_sve256_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_sve256_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_sve256_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_sve256_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_sve256_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_sve256_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_sve256_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sve256_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sve256_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sve256_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sve256_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sve256_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sve256_f32 #define xasinh nsimd_sleef_asinh_u10_sve256_f64 #define xasinhf nsimd_sleef_asinh_u10_sve256_f32 #define xacosh nsimd_sleef_acosh_u10_sve256_f64 #define 
xacoshf nsimd_sleef_acosh_u10_sve256_f32 #define xatanh nsimd_sleef_atanh_u10_sve256_f64 #define xatanhf nsimd_sleef_atanh_u10_sve256_f32 #define xexp2 nsimd_sleef_exp2_u10_sve256_f64 #define xexp2f nsimd_sleef_exp2_u10_sve256_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_sve256_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_sve256_f32 #define xexp10 nsimd_sleef_exp10_u10_sve256_f64 #define xexp10f nsimd_sleef_exp10_u10_sve256_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_sve256_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_sve256_f32 #define xexpm1 nsimd_sleef_expm1_u10_sve256_f64 #define xexpm1f nsimd_sleef_expm1_u10_sve256_f32 #define xlog10 nsimd_sleef_log10_u10_sve256_f64 #define xlog10f nsimd_sleef_log10_u10_sve256_f32 #define xlog2 nsimd_sleef_log2_u10_sve256_f64 #define xlog2f nsimd_sleef_log2_u10_sve256_f32 #define xlog2_u35 nsimd_sleef_log2_u35_sve256_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_sve256_f32 #define xlog1p nsimd_sleef_log1p_u10_sve256_f64 #define xlog1pf nsimd_sleef_log1p_u10_sve256_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_sve256_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_sve256_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_sve256_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_sve256_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_sve256_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_sve256_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_sve256_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_sve256_f32 #define xldexp nsimd_sleef_ldexp_sve256_f64 #define xldexpf nsimd_sleef_ldexp_sve256_f32 #define xilogb nsimd_sleef_ilogb_sve256_f64 #define xilogbf nsimd_sleef_ilogb_sve256_f32 #define xfma nsimd_sleef_fma_sve256_f64 #define xfmaf nsimd_sleef_fma_sve256_f32 #define xsqrt nsimd_sleef_sqrt_sve256_f64 #define xsqrtf nsimd_sleef_sqrt_sve256_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_sve256_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_sve256_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_sve256_f64 #define xsqrtf_u35 
nsimd_sleef_sqrt_u35_sve256_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_sve256_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_sve256_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_sve256_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_sve256_f32 #define xfabs nsimd_sleef_fabs_sve256_f64 #define xfabsf nsimd_sleef_fabs_sve256_f32 #define xcopysign nsimd_sleef_copysign_sve256_f64 #define xcopysignf nsimd_sleef_copysign_sve256_f32 #define xfmax nsimd_sleef_fmax_sve256_f64 #define xfmaxf nsimd_sleef_fmax_sve256_f32 #define xfmin nsimd_sleef_fmin_sve256_f64 #define xfminf nsimd_sleef_fmin_sve256_f32 #define xfdim nsimd_sleef_fdim_sve256_f64 #define xfdimf nsimd_sleef_fdim_sve256_f32 #define xtrunc nsimd_sleef_trunc_sve256_f64 #define xtruncf nsimd_sleef_trunc_sve256_f32 #define xfloor nsimd_sleef_floor_sve256_f64 #define xfloorf nsimd_sleef_floor_sve256_f32 #define xceil nsimd_sleef_ceil_sve256_f64 #define xceilf nsimd_sleef_ceil_sve256_f32 #define xround nsimd_sleef_round_sve256_f64 #define xroundf nsimd_sleef_round_sve256_f32 #define xrint nsimd_sleef_rint_sve256_f64 #define xrintf nsimd_sleef_rint_sve256_f32 #define xnextafter nsimd_sleef_nextafter_sve256_f64 #define xnextafterf nsimd_sleef_nextafter_sve256_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve256_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve256_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve256_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve256_f32 #define xfmod nsimd_sleef_fmod_sve256_f64 #define xfmodf nsimd_sleef_fmod_sve256_f32 #define xremainder nsimd_sleef_remainder_sve256_f64 #define xremainderf nsimd_sleef_remainder_sve256_f32 #define xmodf nsimd_sleef_modf_sve256_f64 #define xmodff nsimd_sleef_modf_sve256_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_sve256_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_sve256_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_sve256_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_sve256_f32 #define xerf_u1 nsimd_sleef_erf_u10_sve256_f64 #define xerff_u1 
nsimd_sleef_erf_u10_sve256_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_sve256_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_sve256_f32 #define xgetInt nsimd_sleef_getInt_sve256_f64 #define xgetIntf nsimd_sleef_getInt_sve256_f32 #define xgetPtr nsimd_sleef_getPtr_sve256_f64 #define xgetPtrf nsimd_sleef_getPtr_sve256_f32 #endif #define rempi nsimd_sleef_rempi_sve256 #define rempif nsimd_sleef_rempif_sve256 #define rempisub nsimd_sleef_rempisub_sve256 #define rempisubf nsimd_sleef_rempisubf_sve256 #define gammak nsimd_gammak_sve256 #define gammafk nsimd_gammafk_sve256 #endif /* ------------------------------------------------------------------------- */ /* Naming of functions sve512 */ #ifdef NSIMD_SVE512 #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_sve512_f64 #define xsinf nsimd_sleef_sin_u35d_sve512_f32 #define xcos nsimd_sleef_cos_u35d_sve512_f64 #define xcosf nsimd_sleef_cos_u35d_sve512_f32 #define xsincos nsimd_sleef_sincos_u35d_sve512_f64 #define xsincosf nsimd_sleef_sincos_u35d_sve512_f32 #define xtan nsimd_sleef_tan_u35d_sve512_f64 #define xtanf nsimd_sleef_tan_u35d_sve512_f32 #define xasin nsimd_sleef_asin_u35d_sve512_f64 #define xasinf nsimd_sleef_asin_u35d_sve512_f32 #define xacos nsimd_sleef_acos_u35d_sve512_f64 #define xacosf nsimd_sleef_acos_u35d_sve512_f32 #define xatan nsimd_sleef_atan_u35d_sve512_f64 #define xatanf nsimd_sleef_atan_u35d_sve512_f32 #define xatan2 nsimd_sleef_atan2_u35d_sve512_f64 #define xatan2f nsimd_sleef_atan2_u35d_sve512_f32 #define xlog nsimd_sleef_log_u35d_sve512_f64 #define xlogf nsimd_sleef_log_u35d_sve512_f32 #define xcbrt nsimd_sleef_cbrt_u35d_sve512_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_sve512_f32 #define xsin_u1 nsimd_sleef_sin_u10d_sve512_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_sve512_f32 #define xcos_u1 nsimd_sleef_cos_u10d_sve512_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_sve512_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_sve512_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_sve512_f32 #define xtan_u1 
nsimd_sleef_tan_u10d_sve512_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_sve512_f32 #define xasin_u1 nsimd_sleef_asin_u10d_sve512_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_sve512_f32 #define xacos_u1 nsimd_sleef_acos_u10d_sve512_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_sve512_f32 #define xatan_u1 nsimd_sleef_atan_u10d_sve512_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_sve512_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_sve512_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_sve512_f32 #define xlog_u1 nsimd_sleef_log_u10d_sve512_f64 #define xlogf_u1 nsimd_sleef_log_u10d_sve512_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_sve512_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sve512_f32 #define xexp nsimd_sleef_exp_u10d_sve512_f64 #define xexpf nsimd_sleef_exp_u10d_sve512_f32 #define xpow nsimd_sleef_pow_u10d_sve512_f64 #define xpowf nsimd_sleef_pow_u10d_sve512_f32 #define xsinh nsimd_sleef_sinh_u10d_sve512_f64 #define xsinhf nsimd_sleef_sinh_u10d_sve512_f32 #define xcosh nsimd_sleef_cosh_u10d_sve512_f64 #define xcoshf nsimd_sleef_cosh_u10d_sve512_f32 #define xtanh nsimd_sleef_tanh_u10d_sve512_f64 #define xtanhf nsimd_sleef_tanh_u10d_sve512_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_sve512_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_sve512_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_sve512_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_sve512_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_sve512_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_sve512_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sve512_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sve512_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sve512_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sve512_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sve512_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sve512_f32 #define xasinh nsimd_sleef_asinh_u10d_sve512_f64 #define xasinhf nsimd_sleef_asinh_u10d_sve512_f32 #define xacosh nsimd_sleef_acosh_u10d_sve512_f64 #define xacoshf 
nsimd_sleef_acosh_u10d_sve512_f32 #define xatanh nsimd_sleef_atanh_u10d_sve512_f64 #define xatanhf nsimd_sleef_atanh_u10d_sve512_f32 #define xexp2 nsimd_sleef_exp2_u10d_sve512_f64 #define xexp2f nsimd_sleef_exp2_u10d_sve512_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_sve512_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_sve512_f32 #define xexp10 nsimd_sleef_exp10_u10d_sve512_f64 #define xexp10f nsimd_sleef_exp10_u10d_sve512_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_sve512_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_sve512_f32 #define xexpm1 nsimd_sleef_expm1_u10d_sve512_f64 #define xexpm1f nsimd_sleef_expm1_u10d_sve512_f32 #define xlog10 nsimd_sleef_log10_u10d_sve512_f64 #define xlog10f nsimd_sleef_log10_u10d_sve512_f32 #define xlog2 nsimd_sleef_log2_u10d_sve512_f64 #define xlog2f nsimd_sleef_log2_u10d_sve512_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_sve512_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_sve512_f32 #define xlog1p nsimd_sleef_log1p_u10d_sve512_f64 #define xlog1pf nsimd_sleef_log1p_u10d_sve512_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_sve512_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_sve512_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_sve512_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_sve512_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_sve512_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_sve512_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_sve512_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_sve512_f32 #define xldexp nsimd_sleef_ldexp_sve512_f64 #define xldexpf nsimd_sleef_ldexp_sve512_f32 #define xilogb nsimd_sleef_ilogb_sve512_f64 #define xilogbf nsimd_sleef_ilogb_sve512_f32 #define xfma nsimd_sleef_fma_sve512_f64 #define xfmaf nsimd_sleef_fma_sve512_f32 #define xsqrt nsimd_sleef_sqrt_sve512_f64 #define xsqrtf nsimd_sleef_sqrt_sve512_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_sve512_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sve512_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_sve512_f64 #define 
xsqrtf_u35 nsimd_sleef_sqrt_u35d_sve512_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_sve512_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_sve512_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_sve512_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_sve512_f32 #define xfabs nsimd_sleef_fabs_sve512_f64 #define xfabsf nsimd_sleef_fabs_sve512_f32 #define xcopysign nsimd_sleef_copysign_sve512_f64 #define xcopysignf nsimd_sleef_copysign_sve512_f32 #define xfmax nsimd_sleef_fmax_sve512_f64 #define xfmaxf nsimd_sleef_fmax_sve512_f32 #define xfmin nsimd_sleef_fmin_sve512_f64 #define xfminf nsimd_sleef_fmin_sve512_f32 #define xfdim nsimd_sleef_fdim_sve512_f64 #define xfdimf nsimd_sleef_fdim_sve512_f32 #define xtrunc nsimd_sleef_trunc_sve512_f64 #define xtruncf nsimd_sleef_trunc_sve512_f32 #define xfloor nsimd_sleef_floor_sve512_f64 #define xfloorf nsimd_sleef_floor_sve512_f32 #define xceil nsimd_sleef_ceil_sve512_f64 #define xceilf nsimd_sleef_ceil_sve512_f32 #define xround nsimd_sleef_round_sve512_f64 #define xroundf nsimd_sleef_round_sve512_f32 #define xrint nsimd_sleef_rint_sve512_f64 #define xrintf nsimd_sleef_rint_sve512_f32 #define xnextafter nsimd_sleef_nextafter_sve512_f64 #define xnextafterf nsimd_sleef_nextafter_sve512_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve512_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve512_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve512_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve512_f32 #define xfmod nsimd_sleef_fmod_sve512_f64 #define xfmodf nsimd_sleef_fmod_sve512_f32 #define xremainder nsimd_sleef_remainder_sve512_f64 #define xremainderf nsimd_sleef_remainder_sve512_f32 #define xmodf nsimd_sleef_modf_sve512_f64 #define xmodff nsimd_sleef_modf_sve512_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_sve512_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sve512_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_sve512_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sve512_f32 #define xerf_u1 nsimd_sleef_erf_u10d_sve512_f64 #define xerff_u1 
nsimd_sleef_erf_u10d_sve512_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_sve512_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_sve512_f32 #define xgetInt nsimd_sleef_getInt_sve512_f64 #define xgetIntf nsimd_sleef_getInt_sve512_f32 #define xgetPtr nsimd_sleef_getPtr_sve512_f64 #define xgetPtrf nsimd_sleef_getPtr_sve512_f32 #else #define xsin nsimd_sleef_sin_u35_sve512_f64 #define xsinf nsimd_sleef_sin_u35_sve512_f32 #define xcos nsimd_sleef_cos_u35_sve512_f64 #define xcosf nsimd_sleef_cos_u35_sve512_f32 #define xsincos nsimd_sleef_sincos_u35_sve512_f64 #define xsincosf nsimd_sleef_sincos_u35_sve512_f32 #define xtan nsimd_sleef_tan_u35_sve512_f64 #define xtanf nsimd_sleef_tan_u35_sve512_f32 #define xasin nsimd_sleef_asin_u35_sve512_f64 #define xasinf nsimd_sleef_asin_u35_sve512_f32 #define xacos nsimd_sleef_acos_u35_sve512_f64 #define xacosf nsimd_sleef_acos_u35_sve512_f32 #define xatan nsimd_sleef_atan_u35_sve512_f64 #define xatanf nsimd_sleef_atan_u35_sve512_f32 #define xatan2 nsimd_sleef_atan2_u35_sve512_f64 #define xatan2f nsimd_sleef_atan2_u35_sve512_f32 #define xlog nsimd_sleef_log_u35_sve512_f64 #define xlogf nsimd_sleef_log_u35_sve512_f32 #define xcbrt nsimd_sleef_cbrt_u35_sve512_f64 #define xcbrtf nsimd_sleef_cbrt_u35_sve512_f32 #define xsin_u1 nsimd_sleef_sin_u10_sve512_f64 #define xsinf_u1 nsimd_sleef_sin_u10_sve512_f32 #define xcos_u1 nsimd_sleef_cos_u10_sve512_f64 #define xcosf_u1 nsimd_sleef_cos_u10_sve512_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_sve512_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_sve512_f32 #define xtan_u1 nsimd_sleef_tan_u10_sve512_f64 #define xtanf_u1 nsimd_sleef_tan_u10_sve512_f32 #define xasin_u1 nsimd_sleef_asin_u10_sve512_f64 #define xasinf_u1 nsimd_sleef_asin_u10_sve512_f32 #define xacos_u1 nsimd_sleef_acos_u10_sve512_f64 #define xacosf_u1 nsimd_sleef_acos_u10_sve512_f32 #define xatan_u1 nsimd_sleef_atan_u10_sve512_f64 #define xatanf_u1 nsimd_sleef_atan_u10_sve512_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_sve512_f64 
/* Auto-generated rename table (do not edit by hand): maps SLEEF public
 * names ("x" prefix, "f" suffix = float32 variant, "_uNN" = accuracy in
 * ULPs) to the nsimd-prefixed symbols of the 512-bit SVE build.  This run
 * sits in the non-DETERMINISTIC (#else) branch of the sve512 section. */
#define xatan2f_u1 nsimd_sleef_atan2_u10_sve512_f32
#define xlog_u1 nsimd_sleef_log_u10_sve512_f64
#define xlogf_u1 nsimd_sleef_log_u10_sve512_f32
#define xcbrt_u1 nsimd_sleef_cbrt_u10_sve512_f64
#define xcbrtf_u1 nsimd_sleef_cbrt_u10_sve512_f32
#define xexp nsimd_sleef_exp_u10_sve512_f64
#define xexpf nsimd_sleef_exp_u10_sve512_f32
#define xpow nsimd_sleef_pow_u10_sve512_f64
#define xpowf nsimd_sleef_pow_u10_sve512_f32
#define xsinh nsimd_sleef_sinh_u10_sve512_f64
#define xsinhf nsimd_sleef_sinh_u10_sve512_f32
#define xcosh nsimd_sleef_cosh_u10_sve512_f64
#define xcoshf nsimd_sleef_cosh_u10_sve512_f32
#define xtanh nsimd_sleef_tanh_u10_sve512_f64
#define xtanhf nsimd_sleef_tanh_u10_sve512_f32
#define xsinh_u35 nsimd_sleef_sinh_u35_sve512_f64
#define xsinhf_u35 nsimd_sleef_sinh_u35_sve512_f32
#define xcosh_u35 nsimd_sleef_cosh_u35_sve512_f64
#define xcoshf_u35 nsimd_sleef_cosh_u35_sve512_f32
#define xtanh_u35 nsimd_sleef_tanh_u35_sve512_f64
#define xtanhf_u35 nsimd_sleef_tanh_u35_sve512_f32
/* "fast" variants trade accuracy (3500 = 35.0 ULPs) for speed. */
#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sve512_f64
#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sve512_f32
#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sve512_f64
#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sve512_f32
#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sve512_f64
#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sve512_f32
#define xasinh nsimd_sleef_asinh_u10_sve512_f64
#define xasinhf nsimd_sleef_asinh_u10_sve512_f32
#define xacosh nsimd_sleef_acosh_u10_sve512_f64
#define xacoshf nsimd_sleef_acosh_u10_sve512_f32
#define xatanh nsimd_sleef_atanh_u10_sve512_f64
#define xatanhf nsimd_sleef_atanh_u10_sve512_f32
#define xexp2 nsimd_sleef_exp2_u10_sve512_f64
#define xexp2f nsimd_sleef_exp2_u10_sve512_f32
#define xexp2_u35 nsimd_sleef_exp2_u35_sve512_f64
#define xexp2f_u35 nsimd_sleef_exp2_u35_sve512_f32
#define xexp10 nsimd_sleef_exp10_u10_sve512_f64
#define xexp10f nsimd_sleef_exp10_u10_sve512_f32
#define xexp10_u35 nsimd_sleef_exp10_u35_sve512_f64
#define xexp10f_u35 nsimd_sleef_exp10_u35_sve512_f32
#define xexpm1 nsimd_sleef_expm1_u10_sve512_f64
#define xexpm1f nsimd_sleef_expm1_u10_sve512_f32
#define xlog10 nsimd_sleef_log10_u10_sve512_f64
#define xlog10f nsimd_sleef_log10_u10_sve512_f32
#define xlog2 nsimd_sleef_log2_u10_sve512_f64
#define xlog2f nsimd_sleef_log2_u10_sve512_f32
#define xlog2_u35 nsimd_sleef_log2_u35_sve512_f64
#define xlog2f_u35 nsimd_sleef_log2_u35_sve512_f32
#define xlog1p nsimd_sleef_log1p_u10_sve512_f64
#define xlog1pf nsimd_sleef_log1p_u10_sve512_f32
#define xsincospi_u05 nsimd_sleef_sincospi_u05_sve512_f64
#define xsincospif_u05 nsimd_sleef_sincospi_u05_sve512_f32
#define xsincospi_u35 nsimd_sleef_sincospi_u35_sve512_f64
#define xsincospif_u35 nsimd_sleef_sincospi_u35_sve512_f32
#define xsinpi_u05 nsimd_sleef_sinpi_u05_sve512_f64
#define xsinpif_u05 nsimd_sleef_sinpi_u05_sve512_f32
#define xcospi_u05 nsimd_sleef_cospi_u05_sve512_f64
#define xcospif_u05 nsimd_sleef_cospi_u05_sve512_f32
/* Exactly-rounded / bit-manipulation functions carry no _uNN accuracy
 * suffix in the SLEEF naming scheme. */
#define xldexp nsimd_sleef_ldexp_sve512_f64
#define xldexpf nsimd_sleef_ldexp_sve512_f32
#define xilogb nsimd_sleef_ilogb_sve512_f64
#define xilogbf nsimd_sleef_ilogb_sve512_f32
#define xfma nsimd_sleef_fma_sve512_f64
#define xfmaf nsimd_sleef_fma_sve512_f32
#define xsqrt nsimd_sleef_sqrt_sve512_f64
#define xsqrtf nsimd_sleef_sqrt_sve512_f32
#define xsqrt_u05 nsimd_sleef_sqrt_u05_sve512_f64
#define xsqrtf_u05 nsimd_sleef_sqrt_u05_sve512_f32
#define xsqrt_u35 nsimd_sleef_sqrt_u35_sve512_f64
#define xsqrtf_u35 nsimd_sleef_sqrt_u35_sve512_f32
#define xhypot_u05 nsimd_sleef_hypot_u05_sve512_f64
#define xhypotf_u05 nsimd_sleef_hypot_u05_sve512_f32
#define xhypot_u35 nsimd_sleef_hypot_u35_sve512_f64
#define xhypotf_u35 nsimd_sleef_hypot_u35_sve512_f32
#define xfabs nsimd_sleef_fabs_sve512_f64
#define xfabsf nsimd_sleef_fabs_sve512_f32
#define xcopysign nsimd_sleef_copysign_sve512_f64
#define xcopysignf nsimd_sleef_copysign_sve512_f32
#define xfmax nsimd_sleef_fmax_sve512_f64
#define xfmaxf nsimd_sleef_fmax_sve512_f32 #define xfmin nsimd_sleef_fmin_sve512_f64 #define xfminf nsimd_sleef_fmin_sve512_f32 #define xfdim nsimd_sleef_fdim_sve512_f64 #define xfdimf nsimd_sleef_fdim_sve512_f32 #define xtrunc nsimd_sleef_trunc_sve512_f64 #define xtruncf nsimd_sleef_trunc_sve512_f32 #define xfloor nsimd_sleef_floor_sve512_f64 #define xfloorf nsimd_sleef_floor_sve512_f32 #define xceil nsimd_sleef_ceil_sve512_f64 #define xceilf nsimd_sleef_ceil_sve512_f32 #define xround nsimd_sleef_round_sve512_f64 #define xroundf nsimd_sleef_round_sve512_f32 #define xrint nsimd_sleef_rint_sve512_f64 #define xrintf nsimd_sleef_rint_sve512_f32 #define xnextafter nsimd_sleef_nextafter_sve512_f64 #define xnextafterf nsimd_sleef_nextafter_sve512_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve512_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve512_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve512_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve512_f32 #define xfmod nsimd_sleef_fmod_sve512_f64 #define xfmodf nsimd_sleef_fmod_sve512_f32 #define xremainder nsimd_sleef_remainder_sve512_f64 #define xremainderf nsimd_sleef_remainder_sve512_f32 #define xmodf nsimd_sleef_modf_sve512_f64 #define xmodff nsimd_sleef_modf_sve512_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_sve512_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_sve512_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_sve512_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_sve512_f32 #define xerf_u1 nsimd_sleef_erf_u10_sve512_f64 #define xerff_u1 nsimd_sleef_erf_u10_sve512_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_sve512_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_sve512_f32 #define xgetInt nsimd_sleef_getInt_sve512_f64 #define xgetIntf nsimd_sleef_getInt_sve512_f32 #define xgetPtr nsimd_sleef_getPtr_sve512_f64 #define xgetPtrf nsimd_sleef_getPtr_sve512_f32 #endif #define rempi nsimd_sleef_rempi_sve512 #define rempif nsimd_sleef_rempif_sve512 #define rempisub nsimd_sleef_rempisub_sve512 #define rempisubf 
nsimd_sleef_rempisubf_sve512 #define gammak nsimd_gammak_sve512 #define gammafk nsimd_gammafk_sve512 #endif /* ------------------------------------------------------------------------- */ /* Naming of functions sve1024 */ #ifdef NSIMD_SVE1024 #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_sve1024_f64 #define xsinf nsimd_sleef_sin_u35d_sve1024_f32 #define xcos nsimd_sleef_cos_u35d_sve1024_f64 #define xcosf nsimd_sleef_cos_u35d_sve1024_f32 #define xsincos nsimd_sleef_sincos_u35d_sve1024_f64 #define xsincosf nsimd_sleef_sincos_u35d_sve1024_f32 #define xtan nsimd_sleef_tan_u35d_sve1024_f64 #define xtanf nsimd_sleef_tan_u35d_sve1024_f32 #define xasin nsimd_sleef_asin_u35d_sve1024_f64 #define xasinf nsimd_sleef_asin_u35d_sve1024_f32 #define xacos nsimd_sleef_acos_u35d_sve1024_f64 #define xacosf nsimd_sleef_acos_u35d_sve1024_f32 #define xatan nsimd_sleef_atan_u35d_sve1024_f64 #define xatanf nsimd_sleef_atan_u35d_sve1024_f32 #define xatan2 nsimd_sleef_atan2_u35d_sve1024_f64 #define xatan2f nsimd_sleef_atan2_u35d_sve1024_f32 #define xlog nsimd_sleef_log_u35d_sve1024_f64 #define xlogf nsimd_sleef_log_u35d_sve1024_f32 #define xcbrt nsimd_sleef_cbrt_u35d_sve1024_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_sve1024_f32 #define xsin_u1 nsimd_sleef_sin_u10d_sve1024_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_sve1024_f32 #define xcos_u1 nsimd_sleef_cos_u10d_sve1024_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_sve1024_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_sve1024_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_sve1024_f32 #define xtan_u1 nsimd_sleef_tan_u10d_sve1024_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_sve1024_f32 #define xasin_u1 nsimd_sleef_asin_u10d_sve1024_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_sve1024_f32 #define xacos_u1 nsimd_sleef_acos_u10d_sve1024_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_sve1024_f32 #define xatan_u1 nsimd_sleef_atan_u10d_sve1024_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_sve1024_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_sve1024_f64 
#define xatan2f_u1 nsimd_sleef_atan2_u10d_sve1024_f32 #define xlog_u1 nsimd_sleef_log_u10d_sve1024_f64 #define xlogf_u1 nsimd_sleef_log_u10d_sve1024_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_sve1024_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sve1024_f32 #define xexp nsimd_sleef_exp_u10d_sve1024_f64 #define xexpf nsimd_sleef_exp_u10d_sve1024_f32 #define xpow nsimd_sleef_pow_u10d_sve1024_f64 #define xpowf nsimd_sleef_pow_u10d_sve1024_f32 #define xsinh nsimd_sleef_sinh_u10d_sve1024_f64 #define xsinhf nsimd_sleef_sinh_u10d_sve1024_f32 #define xcosh nsimd_sleef_cosh_u10d_sve1024_f64 #define xcoshf nsimd_sleef_cosh_u10d_sve1024_f32 #define xtanh nsimd_sleef_tanh_u10d_sve1024_f64 #define xtanhf nsimd_sleef_tanh_u10d_sve1024_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_sve1024_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_sve1024_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_sve1024_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_sve1024_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_sve1024_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_sve1024_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sve1024_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sve1024_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sve1024_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sve1024_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sve1024_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sve1024_f32 #define xasinh nsimd_sleef_asinh_u10d_sve1024_f64 #define xasinhf nsimd_sleef_asinh_u10d_sve1024_f32 #define xacosh nsimd_sleef_acosh_u10d_sve1024_f64 #define xacoshf nsimd_sleef_acosh_u10d_sve1024_f32 #define xatanh nsimd_sleef_atanh_u10d_sve1024_f64 #define xatanhf nsimd_sleef_atanh_u10d_sve1024_f32 #define xexp2 nsimd_sleef_exp2_u10d_sve1024_f64 #define xexp2f nsimd_sleef_exp2_u10d_sve1024_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_sve1024_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_sve1024_f32 #define xexp10 nsimd_sleef_exp10_u10d_sve1024_f64 #define xexp10f 
nsimd_sleef_exp10_u10d_sve1024_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_sve1024_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_sve1024_f32 #define xexpm1 nsimd_sleef_expm1_u10d_sve1024_f64 #define xexpm1f nsimd_sleef_expm1_u10d_sve1024_f32 #define xlog10 nsimd_sleef_log10_u10d_sve1024_f64 #define xlog10f nsimd_sleef_log10_u10d_sve1024_f32 #define xlog2 nsimd_sleef_log2_u10d_sve1024_f64 #define xlog2f nsimd_sleef_log2_u10d_sve1024_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_sve1024_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_sve1024_f32 #define xlog1p nsimd_sleef_log1p_u10d_sve1024_f64 #define xlog1pf nsimd_sleef_log1p_u10d_sve1024_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_sve1024_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_sve1024_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_sve1024_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_sve1024_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_sve1024_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_sve1024_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_sve1024_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_sve1024_f32 #define xldexp nsimd_sleef_ldexp_sve1024_f64 #define xldexpf nsimd_sleef_ldexp_sve1024_f32 #define xilogb nsimd_sleef_ilogb_sve1024_f64 #define xilogbf nsimd_sleef_ilogb_sve1024_f32 #define xfma nsimd_sleef_fma_sve1024_f64 #define xfmaf nsimd_sleef_fma_sve1024_f32 #define xsqrt nsimd_sleef_sqrt_sve1024_f64 #define xsqrtf nsimd_sleef_sqrt_sve1024_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_sve1024_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sve1024_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_sve1024_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sve1024_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_sve1024_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_sve1024_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_sve1024_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_sve1024_f32 #define xfabs nsimd_sleef_fabs_sve1024_f64 #define xfabsf nsimd_sleef_fabs_sve1024_f32 #define xcopysign 
nsimd_sleef_copysign_sve1024_f64 #define xcopysignf nsimd_sleef_copysign_sve1024_f32 #define xfmax nsimd_sleef_fmax_sve1024_f64 #define xfmaxf nsimd_sleef_fmax_sve1024_f32 #define xfmin nsimd_sleef_fmin_sve1024_f64 #define xfminf nsimd_sleef_fmin_sve1024_f32 #define xfdim nsimd_sleef_fdim_sve1024_f64 #define xfdimf nsimd_sleef_fdim_sve1024_f32 #define xtrunc nsimd_sleef_trunc_sve1024_f64 #define xtruncf nsimd_sleef_trunc_sve1024_f32 #define xfloor nsimd_sleef_floor_sve1024_f64 #define xfloorf nsimd_sleef_floor_sve1024_f32 #define xceil nsimd_sleef_ceil_sve1024_f64 #define xceilf nsimd_sleef_ceil_sve1024_f32 #define xround nsimd_sleef_round_sve1024_f64 #define xroundf nsimd_sleef_round_sve1024_f32 #define xrint nsimd_sleef_rint_sve1024_f64 #define xrintf nsimd_sleef_rint_sve1024_f32 #define xnextafter nsimd_sleef_nextafter_sve1024_f64 #define xnextafterf nsimd_sleef_nextafter_sve1024_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve1024_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve1024_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve1024_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve1024_f32 #define xfmod nsimd_sleef_fmod_sve1024_f64 #define xfmodf nsimd_sleef_fmod_sve1024_f32 #define xremainder nsimd_sleef_remainder_sve1024_f64 #define xremainderf nsimd_sleef_remainder_sve1024_f32 #define xmodf nsimd_sleef_modf_sve1024_f64 #define xmodff nsimd_sleef_modf_sve1024_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_sve1024_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sve1024_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_sve1024_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sve1024_f32 #define xerf_u1 nsimd_sleef_erf_u10d_sve1024_f64 #define xerff_u1 nsimd_sleef_erf_u10d_sve1024_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_sve1024_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_sve1024_f32 #define xgetInt nsimd_sleef_getInt_sve1024_f64 #define xgetIntf nsimd_sleef_getInt_sve1024_f32 #define xgetPtr nsimd_sleef_getPtr_sve1024_f64 #define xgetPtrf nsimd_sleef_getPtr_sve1024_f32 
#else #define xsin nsimd_sleef_sin_u35_sve1024_f64 #define xsinf nsimd_sleef_sin_u35_sve1024_f32 #define xcos nsimd_sleef_cos_u35_sve1024_f64 #define xcosf nsimd_sleef_cos_u35_sve1024_f32 #define xsincos nsimd_sleef_sincos_u35_sve1024_f64 #define xsincosf nsimd_sleef_sincos_u35_sve1024_f32 #define xtan nsimd_sleef_tan_u35_sve1024_f64 #define xtanf nsimd_sleef_tan_u35_sve1024_f32 #define xasin nsimd_sleef_asin_u35_sve1024_f64 #define xasinf nsimd_sleef_asin_u35_sve1024_f32 #define xacos nsimd_sleef_acos_u35_sve1024_f64 #define xacosf nsimd_sleef_acos_u35_sve1024_f32 #define xatan nsimd_sleef_atan_u35_sve1024_f64 #define xatanf nsimd_sleef_atan_u35_sve1024_f32 #define xatan2 nsimd_sleef_atan2_u35_sve1024_f64 #define xatan2f nsimd_sleef_atan2_u35_sve1024_f32 #define xlog nsimd_sleef_log_u35_sve1024_f64 #define xlogf nsimd_sleef_log_u35_sve1024_f32 #define xcbrt nsimd_sleef_cbrt_u35_sve1024_f64 #define xcbrtf nsimd_sleef_cbrt_u35_sve1024_f32 #define xsin_u1 nsimd_sleef_sin_u10_sve1024_f64 #define xsinf_u1 nsimd_sleef_sin_u10_sve1024_f32 #define xcos_u1 nsimd_sleef_cos_u10_sve1024_f64 #define xcosf_u1 nsimd_sleef_cos_u10_sve1024_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_sve1024_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_sve1024_f32 #define xtan_u1 nsimd_sleef_tan_u10_sve1024_f64 #define xtanf_u1 nsimd_sleef_tan_u10_sve1024_f32 #define xasin_u1 nsimd_sleef_asin_u10_sve1024_f64 #define xasinf_u1 nsimd_sleef_asin_u10_sve1024_f32 #define xacos_u1 nsimd_sleef_acos_u10_sve1024_f64 #define xacosf_u1 nsimd_sleef_acos_u10_sve1024_f32 #define xatan_u1 nsimd_sleef_atan_u10_sve1024_f64 #define xatanf_u1 nsimd_sleef_atan_u10_sve1024_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_sve1024_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_sve1024_f32 #define xlog_u1 nsimd_sleef_log_u10_sve1024_f64 #define xlogf_u1 nsimd_sleef_log_u10_sve1024_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_sve1024_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_sve1024_f32 #define xexp 
nsimd_sleef_exp_u10_sve1024_f64 #define xexpf nsimd_sleef_exp_u10_sve1024_f32 #define xpow nsimd_sleef_pow_u10_sve1024_f64 #define xpowf nsimd_sleef_pow_u10_sve1024_f32 #define xsinh nsimd_sleef_sinh_u10_sve1024_f64 #define xsinhf nsimd_sleef_sinh_u10_sve1024_f32 #define xcosh nsimd_sleef_cosh_u10_sve1024_f64 #define xcoshf nsimd_sleef_cosh_u10_sve1024_f32 #define xtanh nsimd_sleef_tanh_u10_sve1024_f64 #define xtanhf nsimd_sleef_tanh_u10_sve1024_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_sve1024_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_sve1024_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_sve1024_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_sve1024_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_sve1024_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_sve1024_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sve1024_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sve1024_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sve1024_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sve1024_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sve1024_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sve1024_f32 #define xasinh nsimd_sleef_asinh_u10_sve1024_f64 #define xasinhf nsimd_sleef_asinh_u10_sve1024_f32 #define xacosh nsimd_sleef_acosh_u10_sve1024_f64 #define xacoshf nsimd_sleef_acosh_u10_sve1024_f32 #define xatanh nsimd_sleef_atanh_u10_sve1024_f64 #define xatanhf nsimd_sleef_atanh_u10_sve1024_f32 #define xexp2 nsimd_sleef_exp2_u10_sve1024_f64 #define xexp2f nsimd_sleef_exp2_u10_sve1024_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_sve1024_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_sve1024_f32 #define xexp10 nsimd_sleef_exp10_u10_sve1024_f64 #define xexp10f nsimd_sleef_exp10_u10_sve1024_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_sve1024_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_sve1024_f32 #define xexpm1 nsimd_sleef_expm1_u10_sve1024_f64 #define xexpm1f nsimd_sleef_expm1_u10_sve1024_f32 #define xlog10 nsimd_sleef_log10_u10_sve1024_f64 #define xlog10f 
nsimd_sleef_log10_u10_sve1024_f32 #define xlog2 nsimd_sleef_log2_u10_sve1024_f64 #define xlog2f nsimd_sleef_log2_u10_sve1024_f32 #define xlog2_u35 nsimd_sleef_log2_u35_sve1024_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_sve1024_f32 #define xlog1p nsimd_sleef_log1p_u10_sve1024_f64 #define xlog1pf nsimd_sleef_log1p_u10_sve1024_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_sve1024_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_sve1024_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_sve1024_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_sve1024_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_sve1024_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_sve1024_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_sve1024_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_sve1024_f32 #define xldexp nsimd_sleef_ldexp_sve1024_f64 #define xldexpf nsimd_sleef_ldexp_sve1024_f32 #define xilogb nsimd_sleef_ilogb_sve1024_f64 #define xilogbf nsimd_sleef_ilogb_sve1024_f32 #define xfma nsimd_sleef_fma_sve1024_f64 #define xfmaf nsimd_sleef_fma_sve1024_f32 #define xsqrt nsimd_sleef_sqrt_sve1024_f64 #define xsqrtf nsimd_sleef_sqrt_sve1024_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_sve1024_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_sve1024_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_sve1024_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_sve1024_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_sve1024_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_sve1024_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_sve1024_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_sve1024_f32 #define xfabs nsimd_sleef_fabs_sve1024_f64 #define xfabsf nsimd_sleef_fabs_sve1024_f32 #define xcopysign nsimd_sleef_copysign_sve1024_f64 #define xcopysignf nsimd_sleef_copysign_sve1024_f32 #define xfmax nsimd_sleef_fmax_sve1024_f64 #define xfmaxf nsimd_sleef_fmax_sve1024_f32 #define xfmin nsimd_sleef_fmin_sve1024_f64 #define xfminf nsimd_sleef_fmin_sve1024_f32 #define xfdim nsimd_sleef_fdim_sve1024_f64 #define xfdimf 
nsimd_sleef_fdim_sve1024_f32 #define xtrunc nsimd_sleef_trunc_sve1024_f64 #define xtruncf nsimd_sleef_trunc_sve1024_f32 #define xfloor nsimd_sleef_floor_sve1024_f64 #define xfloorf nsimd_sleef_floor_sve1024_f32 #define xceil nsimd_sleef_ceil_sve1024_f64 #define xceilf nsimd_sleef_ceil_sve1024_f32 #define xround nsimd_sleef_round_sve1024_f64 #define xroundf nsimd_sleef_round_sve1024_f32 #define xrint nsimd_sleef_rint_sve1024_f64 #define xrintf nsimd_sleef_rint_sve1024_f32 #define xnextafter nsimd_sleef_nextafter_sve1024_f64 #define xnextafterf nsimd_sleef_nextafter_sve1024_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve1024_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve1024_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve1024_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve1024_f32 #define xfmod nsimd_sleef_fmod_sve1024_f64 #define xfmodf nsimd_sleef_fmod_sve1024_f32 #define xremainder nsimd_sleef_remainder_sve1024_f64 #define xremainderf nsimd_sleef_remainder_sve1024_f32 #define xmodf nsimd_sleef_modf_sve1024_f64 #define xmodff nsimd_sleef_modf_sve1024_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_sve1024_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_sve1024_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_sve1024_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_sve1024_f32 #define xerf_u1 nsimd_sleef_erf_u10_sve1024_f64 #define xerff_u1 nsimd_sleef_erf_u10_sve1024_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_sve1024_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_sve1024_f32 #define xgetInt nsimd_sleef_getInt_sve1024_f64 #define xgetIntf nsimd_sleef_getInt_sve1024_f32 #define xgetPtr nsimd_sleef_getPtr_sve1024_f64 #define xgetPtrf nsimd_sleef_getPtr_sve1024_f32 #endif #define rempi nsimd_sleef_rempi_sve1024 #define rempif nsimd_sleef_rempif_sve1024 #define rempisub nsimd_sleef_rempisub_sve1024 #define rempisubf nsimd_sleef_rempisubf_sve1024 #define gammak nsimd_gammak_sve1024 #define gammafk nsimd_gammafk_sve1024 #endif /* 
------------------------------------------------------------------------- */ /* Naming of functions sve2048 */ #ifdef NSIMD_SVE2048 #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_sve2048_f64 #define xsinf nsimd_sleef_sin_u35d_sve2048_f32 #define xcos nsimd_sleef_cos_u35d_sve2048_f64 #define xcosf nsimd_sleef_cos_u35d_sve2048_f32 #define xsincos nsimd_sleef_sincos_u35d_sve2048_f64 #define xsincosf nsimd_sleef_sincos_u35d_sve2048_f32 #define xtan nsimd_sleef_tan_u35d_sve2048_f64 #define xtanf nsimd_sleef_tan_u35d_sve2048_f32 #define xasin nsimd_sleef_asin_u35d_sve2048_f64 #define xasinf nsimd_sleef_asin_u35d_sve2048_f32 #define xacos nsimd_sleef_acos_u35d_sve2048_f64 #define xacosf nsimd_sleef_acos_u35d_sve2048_f32 #define xatan nsimd_sleef_atan_u35d_sve2048_f64 #define xatanf nsimd_sleef_atan_u35d_sve2048_f32 #define xatan2 nsimd_sleef_atan2_u35d_sve2048_f64 #define xatan2f nsimd_sleef_atan2_u35d_sve2048_f32 #define xlog nsimd_sleef_log_u35d_sve2048_f64 #define xlogf nsimd_sleef_log_u35d_sve2048_f32 #define xcbrt nsimd_sleef_cbrt_u35d_sve2048_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_sve2048_f32 #define xsin_u1 nsimd_sleef_sin_u10d_sve2048_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_sve2048_f32 #define xcos_u1 nsimd_sleef_cos_u10d_sve2048_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_sve2048_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_sve2048_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_sve2048_f32 #define xtan_u1 nsimd_sleef_tan_u10d_sve2048_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_sve2048_f32 #define xasin_u1 nsimd_sleef_asin_u10d_sve2048_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_sve2048_f32 #define xacos_u1 nsimd_sleef_acos_u10d_sve2048_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_sve2048_f32 #define xatan_u1 nsimd_sleef_atan_u10d_sve2048_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_sve2048_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_sve2048_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_sve2048_f32 #define xlog_u1 nsimd_sleef_log_u10d_sve2048_f64 #define 
xlogf_u1 nsimd_sleef_log_u10d_sve2048_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_sve2048_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sve2048_f32 #define xexp nsimd_sleef_exp_u10d_sve2048_f64 #define xexpf nsimd_sleef_exp_u10d_sve2048_f32 #define xpow nsimd_sleef_pow_u10d_sve2048_f64 #define xpowf nsimd_sleef_pow_u10d_sve2048_f32 #define xsinh nsimd_sleef_sinh_u10d_sve2048_f64 #define xsinhf nsimd_sleef_sinh_u10d_sve2048_f32 #define xcosh nsimd_sleef_cosh_u10d_sve2048_f64 #define xcoshf nsimd_sleef_cosh_u10d_sve2048_f32 #define xtanh nsimd_sleef_tanh_u10d_sve2048_f64 #define xtanhf nsimd_sleef_tanh_u10d_sve2048_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_sve2048_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_sve2048_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_sve2048_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_sve2048_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_sve2048_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_sve2048_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sve2048_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sve2048_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sve2048_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sve2048_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sve2048_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sve2048_f32 #define xasinh nsimd_sleef_asinh_u10d_sve2048_f64 #define xasinhf nsimd_sleef_asinh_u10d_sve2048_f32 #define xacosh nsimd_sleef_acosh_u10d_sve2048_f64 #define xacoshf nsimd_sleef_acosh_u10d_sve2048_f32 #define xatanh nsimd_sleef_atanh_u10d_sve2048_f64 #define xatanhf nsimd_sleef_atanh_u10d_sve2048_f32 #define xexp2 nsimd_sleef_exp2_u10d_sve2048_f64 #define xexp2f nsimd_sleef_exp2_u10d_sve2048_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_sve2048_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_sve2048_f32 #define xexp10 nsimd_sleef_exp10_u10d_sve2048_f64 #define xexp10f nsimd_sleef_exp10_u10d_sve2048_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_sve2048_f64 #define xexp10f_u35 
nsimd_sleef_exp10_u35d_sve2048_f32 #define xexpm1 nsimd_sleef_expm1_u10d_sve2048_f64 #define xexpm1f nsimd_sleef_expm1_u10d_sve2048_f32 #define xlog10 nsimd_sleef_log10_u10d_sve2048_f64 #define xlog10f nsimd_sleef_log10_u10d_sve2048_f32 #define xlog2 nsimd_sleef_log2_u10d_sve2048_f64 #define xlog2f nsimd_sleef_log2_u10d_sve2048_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_sve2048_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_sve2048_f32 #define xlog1p nsimd_sleef_log1p_u10d_sve2048_f64 #define xlog1pf nsimd_sleef_log1p_u10d_sve2048_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_sve2048_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_sve2048_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_sve2048_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_sve2048_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_sve2048_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_sve2048_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_sve2048_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_sve2048_f32 #define xldexp nsimd_sleef_ldexp_sve2048_f64 #define xldexpf nsimd_sleef_ldexp_sve2048_f32 #define xilogb nsimd_sleef_ilogb_sve2048_f64 #define xilogbf nsimd_sleef_ilogb_sve2048_f32 #define xfma nsimd_sleef_fma_sve2048_f64 #define xfmaf nsimd_sleef_fma_sve2048_f32 #define xsqrt nsimd_sleef_sqrt_sve2048_f64 #define xsqrtf nsimd_sleef_sqrt_sve2048_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_sve2048_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sve2048_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_sve2048_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sve2048_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_sve2048_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_sve2048_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_sve2048_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_sve2048_f32 #define xfabs nsimd_sleef_fabs_sve2048_f64 #define xfabsf nsimd_sleef_fabs_sve2048_f32 #define xcopysign nsimd_sleef_copysign_sve2048_f64 #define xcopysignf nsimd_sleef_copysign_sve2048_f32 #define xfmax 
nsimd_sleef_fmax_sve2048_f64 #define xfmaxf nsimd_sleef_fmax_sve2048_f32 #define xfmin nsimd_sleef_fmin_sve2048_f64 #define xfminf nsimd_sleef_fmin_sve2048_f32 #define xfdim nsimd_sleef_fdim_sve2048_f64 #define xfdimf nsimd_sleef_fdim_sve2048_f32 #define xtrunc nsimd_sleef_trunc_sve2048_f64 #define xtruncf nsimd_sleef_trunc_sve2048_f32 #define xfloor nsimd_sleef_floor_sve2048_f64 #define xfloorf nsimd_sleef_floor_sve2048_f32 #define xceil nsimd_sleef_ceil_sve2048_f64 #define xceilf nsimd_sleef_ceil_sve2048_f32 #define xround nsimd_sleef_round_sve2048_f64 #define xroundf nsimd_sleef_round_sve2048_f32 #define xrint nsimd_sleef_rint_sve2048_f64 #define xrintf nsimd_sleef_rint_sve2048_f32 #define xnextafter nsimd_sleef_nextafter_sve2048_f64 #define xnextafterf nsimd_sleef_nextafter_sve2048_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve2048_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve2048_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve2048_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve2048_f32 #define xfmod nsimd_sleef_fmod_sve2048_f64 #define xfmodf nsimd_sleef_fmod_sve2048_f32 #define xremainder nsimd_sleef_remainder_sve2048_f64 #define xremainderf nsimd_sleef_remainder_sve2048_f32 #define xmodf nsimd_sleef_modf_sve2048_f64 #define xmodff nsimd_sleef_modf_sve2048_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_sve2048_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sve2048_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_sve2048_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sve2048_f32 #define xerf_u1 nsimd_sleef_erf_u10d_sve2048_f64 #define xerff_u1 nsimd_sleef_erf_u10d_sve2048_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_sve2048_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_sve2048_f32 #define xgetInt nsimd_sleef_getInt_sve2048_f64 #define xgetIntf nsimd_sleef_getInt_sve2048_f32 #define xgetPtr nsimd_sleef_getPtr_sve2048_f64 #define xgetPtrf nsimd_sleef_getPtr_sve2048_f32 #else #define xsin nsimd_sleef_sin_u35_sve2048_f64 #define xsinf nsimd_sleef_sin_u35_sve2048_f32 
#define xcos nsimd_sleef_cos_u35_sve2048_f64 #define xcosf nsimd_sleef_cos_u35_sve2048_f32 #define xsincos nsimd_sleef_sincos_u35_sve2048_f64 #define xsincosf nsimd_sleef_sincos_u35_sve2048_f32 #define xtan nsimd_sleef_tan_u35_sve2048_f64 #define xtanf nsimd_sleef_tan_u35_sve2048_f32 #define xasin nsimd_sleef_asin_u35_sve2048_f64 #define xasinf nsimd_sleef_asin_u35_sve2048_f32 #define xacos nsimd_sleef_acos_u35_sve2048_f64 #define xacosf nsimd_sleef_acos_u35_sve2048_f32 #define xatan nsimd_sleef_atan_u35_sve2048_f64 #define xatanf nsimd_sleef_atan_u35_sve2048_f32 #define xatan2 nsimd_sleef_atan2_u35_sve2048_f64 #define xatan2f nsimd_sleef_atan2_u35_sve2048_f32 #define xlog nsimd_sleef_log_u35_sve2048_f64 #define xlogf nsimd_sleef_log_u35_sve2048_f32 #define xcbrt nsimd_sleef_cbrt_u35_sve2048_f64 #define xcbrtf nsimd_sleef_cbrt_u35_sve2048_f32 #define xsin_u1 nsimd_sleef_sin_u10_sve2048_f64 #define xsinf_u1 nsimd_sleef_sin_u10_sve2048_f32 #define xcos_u1 nsimd_sleef_cos_u10_sve2048_f64 #define xcosf_u1 nsimd_sleef_cos_u10_sve2048_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_sve2048_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_sve2048_f32 #define xtan_u1 nsimd_sleef_tan_u10_sve2048_f64 #define xtanf_u1 nsimd_sleef_tan_u10_sve2048_f32 #define xasin_u1 nsimd_sleef_asin_u10_sve2048_f64 #define xasinf_u1 nsimd_sleef_asin_u10_sve2048_f32 #define xacos_u1 nsimd_sleef_acos_u10_sve2048_f64 #define xacosf_u1 nsimd_sleef_acos_u10_sve2048_f32 #define xatan_u1 nsimd_sleef_atan_u10_sve2048_f64 #define xatanf_u1 nsimd_sleef_atan_u10_sve2048_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_sve2048_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_sve2048_f32 #define xlog_u1 nsimd_sleef_log_u10_sve2048_f64 #define xlogf_u1 nsimd_sleef_log_u10_sve2048_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_sve2048_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_sve2048_f32 #define xexp nsimd_sleef_exp_u10_sve2048_f64 #define xexpf nsimd_sleef_exp_u10_sve2048_f32 #define xpow nsimd_sleef_pow_u10_sve2048_f64 
#define xpowf nsimd_sleef_pow_u10_sve2048_f32 #define xsinh nsimd_sleef_sinh_u10_sve2048_f64 #define xsinhf nsimd_sleef_sinh_u10_sve2048_f32 #define xcosh nsimd_sleef_cosh_u10_sve2048_f64 #define xcoshf nsimd_sleef_cosh_u10_sve2048_f32 #define xtanh nsimd_sleef_tanh_u10_sve2048_f64 #define xtanhf nsimd_sleef_tanh_u10_sve2048_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_sve2048_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_sve2048_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_sve2048_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_sve2048_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_sve2048_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_sve2048_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sve2048_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sve2048_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sve2048_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sve2048_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sve2048_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sve2048_f32 #define xasinh nsimd_sleef_asinh_u10_sve2048_f64 #define xasinhf nsimd_sleef_asinh_u10_sve2048_f32 #define xacosh nsimd_sleef_acosh_u10_sve2048_f64 #define xacoshf nsimd_sleef_acosh_u10_sve2048_f32 #define xatanh nsimd_sleef_atanh_u10_sve2048_f64 #define xatanhf nsimd_sleef_atanh_u10_sve2048_f32 #define xexp2 nsimd_sleef_exp2_u10_sve2048_f64 #define xexp2f nsimd_sleef_exp2_u10_sve2048_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_sve2048_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_sve2048_f32 #define xexp10 nsimd_sleef_exp10_u10_sve2048_f64 #define xexp10f nsimd_sleef_exp10_u10_sve2048_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_sve2048_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_sve2048_f32 #define xexpm1 nsimd_sleef_expm1_u10_sve2048_f64 #define xexpm1f nsimd_sleef_expm1_u10_sve2048_f32 #define xlog10 nsimd_sleef_log10_u10_sve2048_f64 #define xlog10f nsimd_sleef_log10_u10_sve2048_f32 #define xlog2 nsimd_sleef_log2_u10_sve2048_f64 #define xlog2f 
nsimd_sleef_log2_u10_sve2048_f32 #define xlog2_u35 nsimd_sleef_log2_u35_sve2048_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_sve2048_f32 #define xlog1p nsimd_sleef_log1p_u10_sve2048_f64 #define xlog1pf nsimd_sleef_log1p_u10_sve2048_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_sve2048_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_sve2048_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_sve2048_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_sve2048_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_sve2048_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_sve2048_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_sve2048_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_sve2048_f32 #define xldexp nsimd_sleef_ldexp_sve2048_f64 #define xldexpf nsimd_sleef_ldexp_sve2048_f32 #define xilogb nsimd_sleef_ilogb_sve2048_f64 #define xilogbf nsimd_sleef_ilogb_sve2048_f32 #define xfma nsimd_sleef_fma_sve2048_f64 #define xfmaf nsimd_sleef_fma_sve2048_f32 #define xsqrt nsimd_sleef_sqrt_sve2048_f64 #define xsqrtf nsimd_sleef_sqrt_sve2048_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_sve2048_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_sve2048_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_sve2048_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_sve2048_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_sve2048_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_sve2048_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_sve2048_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_sve2048_f32 #define xfabs nsimd_sleef_fabs_sve2048_f64 #define xfabsf nsimd_sleef_fabs_sve2048_f32 #define xcopysign nsimd_sleef_copysign_sve2048_f64 #define xcopysignf nsimd_sleef_copysign_sve2048_f32 #define xfmax nsimd_sleef_fmax_sve2048_f64 #define xfmaxf nsimd_sleef_fmax_sve2048_f32 #define xfmin nsimd_sleef_fmin_sve2048_f64 #define xfminf nsimd_sleef_fmin_sve2048_f32 #define xfdim nsimd_sleef_fdim_sve2048_f64 #define xfdimf nsimd_sleef_fdim_sve2048_f32 #define xtrunc nsimd_sleef_trunc_sve2048_f64 #define xtruncf 
nsimd_sleef_trunc_sve2048_f32 #define xfloor nsimd_sleef_floor_sve2048_f64 #define xfloorf nsimd_sleef_floor_sve2048_f32 #define xceil nsimd_sleef_ceil_sve2048_f64 #define xceilf nsimd_sleef_ceil_sve2048_f32 #define xround nsimd_sleef_round_sve2048_f64 #define xroundf nsimd_sleef_round_sve2048_f32 #define xrint nsimd_sleef_rint_sve2048_f64 #define xrintf nsimd_sleef_rint_sve2048_f32 #define xnextafter nsimd_sleef_nextafter_sve2048_f64 #define xnextafterf nsimd_sleef_nextafter_sve2048_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve2048_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve2048_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve2048_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve2048_f32 #define xfmod nsimd_sleef_fmod_sve2048_f64 #define xfmodf nsimd_sleef_fmod_sve2048_f32 #define xremainder nsimd_sleef_remainder_sve2048_f64 #define xremainderf nsimd_sleef_remainder_sve2048_f32 #define xmodf nsimd_sleef_modf_sve2048_f64 #define xmodff nsimd_sleef_modf_sve2048_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_sve2048_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_sve2048_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_sve2048_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_sve2048_f32 #define xerf_u1 nsimd_sleef_erf_u10_sve2048_f64 #define xerff_u1 nsimd_sleef_erf_u10_sve2048_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_sve2048_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_sve2048_f32 #define xgetInt nsimd_sleef_getInt_sve2048_f64 #define xgetIntf nsimd_sleef_getInt_sve2048_f32 #define xgetPtr nsimd_sleef_getPtr_sve2048_f64 #define xgetPtrf nsimd_sleef_getPtr_sve2048_f32 #endif #define rempi nsimd_sleef_rempi_sve2048 #define rempif nsimd_sleef_rempif_sve2048 #define rempisub nsimd_sleef_rempisub_sve2048 #define rempisubf nsimd_sleef_rempisubf_sve2048 #define gammak nsimd_gammak_sve2048 #define gammafk nsimd_gammafk_sve2048 #endif #endif ================================================ FILE: src/renamevsx.h ================================================ #ifndef RENAMEVSX_H 
#define RENAMEVSX_H /* ------------------------------------------------------------------------- */ /* Naming of functions vmx */ #ifdef NSIMD_VMX #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_vmx_f64 #define xsinf nsimd_sleef_sin_u35d_vmx_f32 #define xcos nsimd_sleef_cos_u35d_vmx_f64 #define xcosf nsimd_sleef_cos_u35d_vmx_f32 #define xsincos nsimd_sleef_sincos_u35d_vmx_f64 #define xsincosf nsimd_sleef_sincos_u35d_vmx_f32 #define xtan nsimd_sleef_tan_u35d_vmx_f64 #define xtanf nsimd_sleef_tan_u35d_vmx_f32 #define xasin nsimd_sleef_asin_u35d_vmx_f64 #define xasinf nsimd_sleef_asin_u35d_vmx_f32 #define xacos nsimd_sleef_acos_u35d_vmx_f64 #define xacosf nsimd_sleef_acos_u35d_vmx_f32 #define xatan nsimd_sleef_atan_u35d_vmx_f64 #define xatanf nsimd_sleef_atan_u35d_vmx_f32 #define xatan2 nsimd_sleef_atan2_u35d_vmx_f64 #define xatan2f nsimd_sleef_atan2_u35d_vmx_f32 #define xlog nsimd_sleef_log_u35d_vmx_f64 #define xlogf nsimd_sleef_log_u35d_vmx_f32 #define xcbrt nsimd_sleef_cbrt_u35d_vmx_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_vmx_f32 #define xsin_u1 nsimd_sleef_sin_u10d_vmx_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_vmx_f32 #define xcos_u1 nsimd_sleef_cos_u10d_vmx_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_vmx_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_vmx_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_vmx_f32 #define xtan_u1 nsimd_sleef_tan_u10d_vmx_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_vmx_f32 #define xasin_u1 nsimd_sleef_asin_u10d_vmx_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_vmx_f32 #define xacos_u1 nsimd_sleef_acos_u10d_vmx_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_vmx_f32 #define xatan_u1 nsimd_sleef_atan_u10d_vmx_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_vmx_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_vmx_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_vmx_f32 #define xlog_u1 nsimd_sleef_log_u10d_vmx_f64 #define xlogf_u1 nsimd_sleef_log_u10d_vmx_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_vmx_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_vmx_f32 
#define xexp nsimd_sleef_exp_u10d_vmx_f64 #define xexpf nsimd_sleef_exp_u10d_vmx_f32 #define xpow nsimd_sleef_pow_u10d_vmx_f64 #define xpowf nsimd_sleef_pow_u10d_vmx_f32 #define xsinh nsimd_sleef_sinh_u10d_vmx_f64 #define xsinhf nsimd_sleef_sinh_u10d_vmx_f32 #define xcosh nsimd_sleef_cosh_u10d_vmx_f64 #define xcoshf nsimd_sleef_cosh_u10d_vmx_f32 #define xtanh nsimd_sleef_tanh_u10d_vmx_f64 #define xtanhf nsimd_sleef_tanh_u10d_vmx_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_vmx_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_vmx_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_vmx_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_vmx_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_vmx_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_vmx_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_vmx_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_vmx_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_vmx_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_vmx_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_vmx_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_vmx_f32 #define xasinh nsimd_sleef_asinh_u10d_vmx_f64 #define xasinhf nsimd_sleef_asinh_u10d_vmx_f32 #define xacosh nsimd_sleef_acosh_u10d_vmx_f64 #define xacoshf nsimd_sleef_acosh_u10d_vmx_f32 #define xatanh nsimd_sleef_atanh_u10d_vmx_f64 #define xatanhf nsimd_sleef_atanh_u10d_vmx_f32 #define xexp2 nsimd_sleef_exp2_u10d_vmx_f64 #define xexp2f nsimd_sleef_exp2_u10d_vmx_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_vmx_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_vmx_f32 #define xexp10 nsimd_sleef_exp10_u10d_vmx_f64 #define xexp10f nsimd_sleef_exp10_u10d_vmx_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_vmx_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_vmx_f32 #define xexpm1 nsimd_sleef_expm1_u10d_vmx_f64 #define xexpm1f nsimd_sleef_expm1_u10d_vmx_f32 #define xlog10 nsimd_sleef_log10_u10d_vmx_f64 #define xlog10f nsimd_sleef_log10_u10d_vmx_f32 #define xlog2 nsimd_sleef_log2_u10d_vmx_f64 #define xlog2f 
nsimd_sleef_log2_u10d_vmx_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_vmx_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_vmx_f32 #define xlog1p nsimd_sleef_log1p_u10d_vmx_f64 #define xlog1pf nsimd_sleef_log1p_u10d_vmx_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_vmx_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_vmx_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_vmx_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_vmx_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_vmx_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_vmx_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_vmx_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_vmx_f32 #define xldexp nsimd_sleef_ldexp_vmx_f64 #define xldexpf nsimd_sleef_ldexp_vmx_f32 #define xilogb nsimd_sleef_ilogb_vmx_f64 #define xilogbf nsimd_sleef_ilogb_vmx_f32 #define xfma nsimd_sleef_fma_vmx_f64 #define xfmaf nsimd_sleef_fma_vmx_f32 #define xsqrt nsimd_sleef_sqrt_vmx_f64 #define xsqrtf nsimd_sleef_sqrt_vmx_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_vmx_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_vmx_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_vmx_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_vmx_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_vmx_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_vmx_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_vmx_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_vmx_f32 #define xfabs nsimd_sleef_fabs_vmx_f64 #define xfabsf nsimd_sleef_fabs_vmx_f32 #define xcopysign nsimd_sleef_copysign_vmx_f64 #define xcopysignf nsimd_sleef_copysign_vmx_f32 #define xfmax nsimd_sleef_fmax_vmx_f64 #define xfmaxf nsimd_sleef_fmax_vmx_f32 #define xfmin nsimd_sleef_fmin_vmx_f64 #define xfminf nsimd_sleef_fmin_vmx_f32 #define xfdim nsimd_sleef_fdim_vmx_f64 #define xfdimf nsimd_sleef_fdim_vmx_f32 #define xtrunc nsimd_sleef_trunc_vmx_f64 #define xtruncf nsimd_sleef_trunc_vmx_f32 #define xfloor nsimd_sleef_floor_vmx_f64 #define xfloorf nsimd_sleef_floor_vmx_f32 #define xceil nsimd_sleef_ceil_vmx_f64 #define xceilf 
nsimd_sleef_ceil_vmx_f32 #define xround nsimd_sleef_round_vmx_f64 #define xroundf nsimd_sleef_round_vmx_f32 #define xrint nsimd_sleef_rint_vmx_f64 #define xrintf nsimd_sleef_rint_vmx_f32 #define xnextafter nsimd_sleef_nextafter_vmx_f64 #define xnextafterf nsimd_sleef_nextafter_vmx_f32 #define xfrfrexp nsimd_sleef_frfrexp_vmx_f64 #define xfrfrexpf nsimd_sleef_frfrexp_vmx_f32 #define xexpfrexp nsimd_sleef_expfrexp_vmx_f64 #define xexpfrexpf nsimd_sleef_expfrexp_vmx_f32 #define xfmod nsimd_sleef_fmod_vmx_f64 #define xfmodf nsimd_sleef_fmod_vmx_f32 #define xremainder nsimd_sleef_remainder_vmx_f64 #define xremainderf nsimd_sleef_remainder_vmx_f32 #define xmodf nsimd_sleef_modf_vmx_f64 #define xmodff nsimd_sleef_modf_vmx_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_vmx_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_vmx_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_vmx_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_vmx_f32 #define xerf_u1 nsimd_sleef_erf_u10d_vmx_f64 #define xerff_u1 nsimd_sleef_erf_u10d_vmx_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_vmx_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_vmx_f32 #define xgetInt nsimd_sleef_getInt_vmx_f64 #define xgetIntf nsimd_sleef_getInt_vmx_f32 #define xgetPtr nsimd_sleef_getPtr_vmx_f64 #define xgetPtrf nsimd_sleef_getPtr_vmx_f32 #else #define xsin nsimd_sleef_sin_u35_vmx_f64 #define xsinf nsimd_sleef_sin_u35_vmx_f32 #define xcos nsimd_sleef_cos_u35_vmx_f64 #define xcosf nsimd_sleef_cos_u35_vmx_f32 #define xsincos nsimd_sleef_sincos_u35_vmx_f64 #define xsincosf nsimd_sleef_sincos_u35_vmx_f32 #define xtan nsimd_sleef_tan_u35_vmx_f64 #define xtanf nsimd_sleef_tan_u35_vmx_f32 #define xasin nsimd_sleef_asin_u35_vmx_f64 #define xasinf nsimd_sleef_asin_u35_vmx_f32 #define xacos nsimd_sleef_acos_u35_vmx_f64 #define xacosf nsimd_sleef_acos_u35_vmx_f32 #define xatan nsimd_sleef_atan_u35_vmx_f64 #define xatanf nsimd_sleef_atan_u35_vmx_f32 #define xatan2 nsimd_sleef_atan2_u35_vmx_f64 #define xatan2f nsimd_sleef_atan2_u35_vmx_f32 
#define xlog nsimd_sleef_log_u35_vmx_f64 #define xlogf nsimd_sleef_log_u35_vmx_f32 #define xcbrt nsimd_sleef_cbrt_u35_vmx_f64 #define xcbrtf nsimd_sleef_cbrt_u35_vmx_f32 #define xsin_u1 nsimd_sleef_sin_u10_vmx_f64 #define xsinf_u1 nsimd_sleef_sin_u10_vmx_f32 #define xcos_u1 nsimd_sleef_cos_u10_vmx_f64 #define xcosf_u1 nsimd_sleef_cos_u10_vmx_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_vmx_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_vmx_f32 #define xtan_u1 nsimd_sleef_tan_u10_vmx_f64 #define xtanf_u1 nsimd_sleef_tan_u10_vmx_f32 #define xasin_u1 nsimd_sleef_asin_u10_vmx_f64 #define xasinf_u1 nsimd_sleef_asin_u10_vmx_f32 #define xacos_u1 nsimd_sleef_acos_u10_vmx_f64 #define xacosf_u1 nsimd_sleef_acos_u10_vmx_f32 #define xatan_u1 nsimd_sleef_atan_u10_vmx_f64 #define xatanf_u1 nsimd_sleef_atan_u10_vmx_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_vmx_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_vmx_f32 #define xlog_u1 nsimd_sleef_log_u10_vmx_f64 #define xlogf_u1 nsimd_sleef_log_u10_vmx_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_vmx_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_vmx_f32 #define xexp nsimd_sleef_exp_u10_vmx_f64 #define xexpf nsimd_sleef_exp_u10_vmx_f32 #define xpow nsimd_sleef_pow_u10_vmx_f64 #define xpowf nsimd_sleef_pow_u10_vmx_f32 #define xsinh nsimd_sleef_sinh_u10_vmx_f64 #define xsinhf nsimd_sleef_sinh_u10_vmx_f32 #define xcosh nsimd_sleef_cosh_u10_vmx_f64 #define xcoshf nsimd_sleef_cosh_u10_vmx_f32 #define xtanh nsimd_sleef_tanh_u10_vmx_f64 #define xtanhf nsimd_sleef_tanh_u10_vmx_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_vmx_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_vmx_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_vmx_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_vmx_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_vmx_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_vmx_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_vmx_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_vmx_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_vmx_f64 #define 
xfastcosf_u3500 nsimd_sleef_fastcos_u3500_vmx_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_vmx_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_vmx_f32 #define xasinh nsimd_sleef_asinh_u10_vmx_f64 #define xasinhf nsimd_sleef_asinh_u10_vmx_f32 #define xacosh nsimd_sleef_acosh_u10_vmx_f64 #define xacoshf nsimd_sleef_acosh_u10_vmx_f32 #define xatanh nsimd_sleef_atanh_u10_vmx_f64 #define xatanhf nsimd_sleef_atanh_u10_vmx_f32 #define xexp2 nsimd_sleef_exp2_u10_vmx_f64 #define xexp2f nsimd_sleef_exp2_u10_vmx_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_vmx_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_vmx_f32 #define xexp10 nsimd_sleef_exp10_u10_vmx_f64 #define xexp10f nsimd_sleef_exp10_u10_vmx_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_vmx_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_vmx_f32 #define xexpm1 nsimd_sleef_expm1_u10_vmx_f64 #define xexpm1f nsimd_sleef_expm1_u10_vmx_f32 #define xlog10 nsimd_sleef_log10_u10_vmx_f64 #define xlog10f nsimd_sleef_log10_u10_vmx_f32 #define xlog2 nsimd_sleef_log2_u10_vmx_f64 #define xlog2f nsimd_sleef_log2_u10_vmx_f32 #define xlog2_u35 nsimd_sleef_log2_u35_vmx_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_vmx_f32 #define xlog1p nsimd_sleef_log1p_u10_vmx_f64 #define xlog1pf nsimd_sleef_log1p_u10_vmx_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_vmx_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_vmx_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_vmx_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_vmx_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_vmx_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_vmx_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_vmx_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_vmx_f32 #define xldexp nsimd_sleef_ldexp_vmx_f64 #define xldexpf nsimd_sleef_ldexp_vmx_f32 #define xilogb nsimd_sleef_ilogb_vmx_f64 #define xilogbf nsimd_sleef_ilogb_vmx_f32 #define xfma nsimd_sleef_fma_vmx_f64 #define xfmaf nsimd_sleef_fma_vmx_f32 #define xsqrt nsimd_sleef_sqrt_vmx_f64 #define xsqrtf 
nsimd_sleef_sqrt_vmx_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_vmx_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_vmx_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_vmx_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_vmx_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_vmx_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_vmx_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_vmx_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_vmx_f32 #define xfabs nsimd_sleef_fabs_vmx_f64 #define xfabsf nsimd_sleef_fabs_vmx_f32 #define xcopysign nsimd_sleef_copysign_vmx_f64 #define xcopysignf nsimd_sleef_copysign_vmx_f32 #define xfmax nsimd_sleef_fmax_vmx_f64 #define xfmaxf nsimd_sleef_fmax_vmx_f32 #define xfmin nsimd_sleef_fmin_vmx_f64 #define xfminf nsimd_sleef_fmin_vmx_f32 #define xfdim nsimd_sleef_fdim_vmx_f64 #define xfdimf nsimd_sleef_fdim_vmx_f32 #define xtrunc nsimd_sleef_trunc_vmx_f64 #define xtruncf nsimd_sleef_trunc_vmx_f32 #define xfloor nsimd_sleef_floor_vmx_f64 #define xfloorf nsimd_sleef_floor_vmx_f32 #define xceil nsimd_sleef_ceil_vmx_f64 #define xceilf nsimd_sleef_ceil_vmx_f32 #define xround nsimd_sleef_round_vmx_f64 #define xroundf nsimd_sleef_round_vmx_f32 #define xrint nsimd_sleef_rint_vmx_f64 #define xrintf nsimd_sleef_rint_vmx_f32 #define xnextafter nsimd_sleef_nextafter_vmx_f64 #define xnextafterf nsimd_sleef_nextafter_vmx_f32 #define xfrfrexp nsimd_sleef_frfrexp_vmx_f64 #define xfrfrexpf nsimd_sleef_frfrexp_vmx_f32 #define xexpfrexp nsimd_sleef_expfrexp_vmx_f64 #define xexpfrexpf nsimd_sleef_expfrexp_vmx_f32 #define xfmod nsimd_sleef_fmod_vmx_f64 #define xfmodf nsimd_sleef_fmod_vmx_f32 #define xremainder nsimd_sleef_remainder_vmx_f64 #define xremainderf nsimd_sleef_remainder_vmx_f32 #define xmodf nsimd_sleef_modf_vmx_f64 #define xmodff nsimd_sleef_modf_vmx_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_vmx_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_vmx_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_vmx_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_vmx_f32 #define xerf_u1 
nsimd_sleef_erf_u10_vmx_f64 #define xerff_u1 nsimd_sleef_erf_u10_vmx_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_vmx_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_vmx_f32 #define xgetInt nsimd_sleef_getInt_vmx_f64 #define xgetIntf nsimd_sleef_getInt_vmx_f32 #define xgetPtr nsimd_sleef_getPtr_vmx_f64 #define xgetPtrf nsimd_sleef_getPtr_vmx_f32 #endif #define rempi nsimd_sleef_rempi_vmx #define rempif nsimd_sleef_rempif_vmx #define rempisub nsimd_sleef_rempisub_vmx #define rempisubf nsimd_sleef_rempisubf_vmx #define gammak nsimd_gammak_vmx #define gammafk nsimd_gammafk_vmx #endif /* ------------------------------------------------------------------------- */ /* Naming of functions vsx */ #ifdef NSIMD_VSX #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_vsx_f64 #define xsinf nsimd_sleef_sin_u35d_vsx_f32 #define xcos nsimd_sleef_cos_u35d_vsx_f64 #define xcosf nsimd_sleef_cos_u35d_vsx_f32 #define xsincos nsimd_sleef_sincos_u35d_vsx_f64 #define xsincosf nsimd_sleef_sincos_u35d_vsx_f32 #define xtan nsimd_sleef_tan_u35d_vsx_f64 #define xtanf nsimd_sleef_tan_u35d_vsx_f32 #define xasin nsimd_sleef_asin_u35d_vsx_f64 #define xasinf nsimd_sleef_asin_u35d_vsx_f32 #define xacos nsimd_sleef_acos_u35d_vsx_f64 #define xacosf nsimd_sleef_acos_u35d_vsx_f32 #define xatan nsimd_sleef_atan_u35d_vsx_f64 #define xatanf nsimd_sleef_atan_u35d_vsx_f32 #define xatan2 nsimd_sleef_atan2_u35d_vsx_f64 #define xatan2f nsimd_sleef_atan2_u35d_vsx_f32 #define xlog nsimd_sleef_log_u35d_vsx_f64 #define xlogf nsimd_sleef_log_u35d_vsx_f32 #define xcbrt nsimd_sleef_cbrt_u35d_vsx_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_vsx_f32 #define xsin_u1 nsimd_sleef_sin_u10d_vsx_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_vsx_f32 #define xcos_u1 nsimd_sleef_cos_u10d_vsx_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_vsx_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_vsx_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_vsx_f32 #define xtan_u1 nsimd_sleef_tan_u10d_vsx_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_vsx_f32 #define 
xasin_u1 nsimd_sleef_asin_u10d_vsx_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_vsx_f32 #define xacos_u1 nsimd_sleef_acos_u10d_vsx_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_vsx_f32 #define xatan_u1 nsimd_sleef_atan_u10d_vsx_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_vsx_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_vsx_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_vsx_f32 #define xlog_u1 nsimd_sleef_log_u10d_vsx_f64 #define xlogf_u1 nsimd_sleef_log_u10d_vsx_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_vsx_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_vsx_f32 #define xexp nsimd_sleef_exp_u10d_vsx_f64 #define xexpf nsimd_sleef_exp_u10d_vsx_f32 #define xpow nsimd_sleef_pow_u10d_vsx_f64 #define xpowf nsimd_sleef_pow_u10d_vsx_f32 #define xsinh nsimd_sleef_sinh_u10d_vsx_f64 #define xsinhf nsimd_sleef_sinh_u10d_vsx_f32 #define xcosh nsimd_sleef_cosh_u10d_vsx_f64 #define xcoshf nsimd_sleef_cosh_u10d_vsx_f32 #define xtanh nsimd_sleef_tanh_u10d_vsx_f64 #define xtanhf nsimd_sleef_tanh_u10d_vsx_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_vsx_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_vsx_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_vsx_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_vsx_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_vsx_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_vsx_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_vsx_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_vsx_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_vsx_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_vsx_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_vsx_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_vsx_f32 #define xasinh nsimd_sleef_asinh_u10d_vsx_f64 #define xasinhf nsimd_sleef_asinh_u10d_vsx_f32 #define xacosh nsimd_sleef_acosh_u10d_vsx_f64 #define xacoshf nsimd_sleef_acosh_u10d_vsx_f32 #define xatanh nsimd_sleef_atanh_u10d_vsx_f64 #define xatanhf nsimd_sleef_atanh_u10d_vsx_f32 #define xexp2 nsimd_sleef_exp2_u10d_vsx_f64 #define xexp2f 
nsimd_sleef_exp2_u10d_vsx_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_vsx_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_vsx_f32 #define xexp10 nsimd_sleef_exp10_u10d_vsx_f64 #define xexp10f nsimd_sleef_exp10_u10d_vsx_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_vsx_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_vsx_f32 #define xexpm1 nsimd_sleef_expm1_u10d_vsx_f64 #define xexpm1f nsimd_sleef_expm1_u10d_vsx_f32 #define xlog10 nsimd_sleef_log10_u10d_vsx_f64 #define xlog10f nsimd_sleef_log10_u10d_vsx_f32 #define xlog2 nsimd_sleef_log2_u10d_vsx_f64 #define xlog2f nsimd_sleef_log2_u10d_vsx_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_vsx_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_vsx_f32 #define xlog1p nsimd_sleef_log1p_u10d_vsx_f64 #define xlog1pf nsimd_sleef_log1p_u10d_vsx_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_vsx_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_vsx_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_vsx_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_vsx_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_vsx_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_vsx_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_vsx_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_vsx_f32 #define xldexp nsimd_sleef_ldexp_vsx_f64 #define xldexpf nsimd_sleef_ldexp_vsx_f32 #define xilogb nsimd_sleef_ilogb_vsx_f64 #define xilogbf nsimd_sleef_ilogb_vsx_f32 #define xfma nsimd_sleef_fma_vsx_f64 #define xfmaf nsimd_sleef_fma_vsx_f32 #define xsqrt nsimd_sleef_sqrt_vsx_f64 #define xsqrtf nsimd_sleef_sqrt_vsx_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_vsx_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_vsx_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_vsx_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_vsx_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_vsx_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_vsx_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_vsx_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_vsx_f32 #define xfabs nsimd_sleef_fabs_vsx_f64 #define xfabsf 
nsimd_sleef_fabs_vsx_f32 #define xcopysign nsimd_sleef_copysign_vsx_f64 #define xcopysignf nsimd_sleef_copysign_vsx_f32 #define xfmax nsimd_sleef_fmax_vsx_f64 #define xfmaxf nsimd_sleef_fmax_vsx_f32 #define xfmin nsimd_sleef_fmin_vsx_f64 #define xfminf nsimd_sleef_fmin_vsx_f32 #define xfdim nsimd_sleef_fdim_vsx_f64 #define xfdimf nsimd_sleef_fdim_vsx_f32 #define xtrunc nsimd_sleef_trunc_vsx_f64 #define xtruncf nsimd_sleef_trunc_vsx_f32 #define xfloor nsimd_sleef_floor_vsx_f64 #define xfloorf nsimd_sleef_floor_vsx_f32 #define xceil nsimd_sleef_ceil_vsx_f64 #define xceilf nsimd_sleef_ceil_vsx_f32 #define xround nsimd_sleef_round_vsx_f64 #define xroundf nsimd_sleef_round_vsx_f32 #define xrint nsimd_sleef_rint_vsx_f64 #define xrintf nsimd_sleef_rint_vsx_f32 #define xnextafter nsimd_sleef_nextafter_vsx_f64 #define xnextafterf nsimd_sleef_nextafter_vsx_f32 #define xfrfrexp nsimd_sleef_frfrexp_vsx_f64 #define xfrfrexpf nsimd_sleef_frfrexp_vsx_f32 #define xexpfrexp nsimd_sleef_expfrexp_vsx_f64 #define xexpfrexpf nsimd_sleef_expfrexp_vsx_f32 #define xfmod nsimd_sleef_fmod_vsx_f64 #define xfmodf nsimd_sleef_fmod_vsx_f32 #define xremainder nsimd_sleef_remainder_vsx_f64 #define xremainderf nsimd_sleef_remainder_vsx_f32 #define xmodf nsimd_sleef_modf_vsx_f64 #define xmodff nsimd_sleef_modf_vsx_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_vsx_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_vsx_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_vsx_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_vsx_f32 #define xerf_u1 nsimd_sleef_erf_u10d_vsx_f64 #define xerff_u1 nsimd_sleef_erf_u10d_vsx_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_vsx_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_vsx_f32 #define xgetInt nsimd_sleef_getInt_vsx_f64 #define xgetIntf nsimd_sleef_getInt_vsx_f32 #define xgetPtr nsimd_sleef_getPtr_vsx_f64 #define xgetPtrf nsimd_sleef_getPtr_vsx_f32 #else #define xsin nsimd_sleef_sin_u35_vsx_f64 #define xsinf nsimd_sleef_sin_u35_vsx_f32 #define xcos 
nsimd_sleef_cos_u35_vsx_f64 #define xcosf nsimd_sleef_cos_u35_vsx_f32 #define xsincos nsimd_sleef_sincos_u35_vsx_f64 #define xsincosf nsimd_sleef_sincos_u35_vsx_f32 #define xtan nsimd_sleef_tan_u35_vsx_f64 #define xtanf nsimd_sleef_tan_u35_vsx_f32 #define xasin nsimd_sleef_asin_u35_vsx_f64 #define xasinf nsimd_sleef_asin_u35_vsx_f32 #define xacos nsimd_sleef_acos_u35_vsx_f64 #define xacosf nsimd_sleef_acos_u35_vsx_f32 #define xatan nsimd_sleef_atan_u35_vsx_f64 #define xatanf nsimd_sleef_atan_u35_vsx_f32 #define xatan2 nsimd_sleef_atan2_u35_vsx_f64 #define xatan2f nsimd_sleef_atan2_u35_vsx_f32 #define xlog nsimd_sleef_log_u35_vsx_f64 #define xlogf nsimd_sleef_log_u35_vsx_f32 #define xcbrt nsimd_sleef_cbrt_u35_vsx_f64 #define xcbrtf nsimd_sleef_cbrt_u35_vsx_f32 #define xsin_u1 nsimd_sleef_sin_u10_vsx_f64 #define xsinf_u1 nsimd_sleef_sin_u10_vsx_f32 #define xcos_u1 nsimd_sleef_cos_u10_vsx_f64 #define xcosf_u1 nsimd_sleef_cos_u10_vsx_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_vsx_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_vsx_f32 #define xtan_u1 nsimd_sleef_tan_u10_vsx_f64 #define xtanf_u1 nsimd_sleef_tan_u10_vsx_f32 #define xasin_u1 nsimd_sleef_asin_u10_vsx_f64 #define xasinf_u1 nsimd_sleef_asin_u10_vsx_f32 #define xacos_u1 nsimd_sleef_acos_u10_vsx_f64 #define xacosf_u1 nsimd_sleef_acos_u10_vsx_f32 #define xatan_u1 nsimd_sleef_atan_u10_vsx_f64 #define xatanf_u1 nsimd_sleef_atan_u10_vsx_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_vsx_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_vsx_f32 #define xlog_u1 nsimd_sleef_log_u10_vsx_f64 #define xlogf_u1 nsimd_sleef_log_u10_vsx_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_vsx_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_vsx_f32 #define xexp nsimd_sleef_exp_u10_vsx_f64 #define xexpf nsimd_sleef_exp_u10_vsx_f32 #define xpow nsimd_sleef_pow_u10_vsx_f64 #define xpowf nsimd_sleef_pow_u10_vsx_f32 #define xsinh nsimd_sleef_sinh_u10_vsx_f64 #define xsinhf nsimd_sleef_sinh_u10_vsx_f32 #define xcosh nsimd_sleef_cosh_u10_vsx_f64 #define 
xcoshf nsimd_sleef_cosh_u10_vsx_f32 #define xtanh nsimd_sleef_tanh_u10_vsx_f64 #define xtanhf nsimd_sleef_tanh_u10_vsx_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_vsx_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_vsx_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_vsx_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_vsx_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_vsx_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_vsx_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_vsx_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_vsx_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_vsx_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_vsx_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_vsx_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_vsx_f32 #define xasinh nsimd_sleef_asinh_u10_vsx_f64 #define xasinhf nsimd_sleef_asinh_u10_vsx_f32 #define xacosh nsimd_sleef_acosh_u10_vsx_f64 #define xacoshf nsimd_sleef_acosh_u10_vsx_f32 #define xatanh nsimd_sleef_atanh_u10_vsx_f64 #define xatanhf nsimd_sleef_atanh_u10_vsx_f32 #define xexp2 nsimd_sleef_exp2_u10_vsx_f64 #define xexp2f nsimd_sleef_exp2_u10_vsx_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_vsx_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_vsx_f32 #define xexp10 nsimd_sleef_exp10_u10_vsx_f64 #define xexp10f nsimd_sleef_exp10_u10_vsx_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_vsx_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_vsx_f32 #define xexpm1 nsimd_sleef_expm1_u10_vsx_f64 #define xexpm1f nsimd_sleef_expm1_u10_vsx_f32 #define xlog10 nsimd_sleef_log10_u10_vsx_f64 #define xlog10f nsimd_sleef_log10_u10_vsx_f32 #define xlog2 nsimd_sleef_log2_u10_vsx_f64 #define xlog2f nsimd_sleef_log2_u10_vsx_f32 #define xlog2_u35 nsimd_sleef_log2_u35_vsx_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_vsx_f32 #define xlog1p nsimd_sleef_log1p_u10_vsx_f64 #define xlog1pf nsimd_sleef_log1p_u10_vsx_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_vsx_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_vsx_f32 #define xsincospi_u35 
nsimd_sleef_sincospi_u35_vsx_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_vsx_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_vsx_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_vsx_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_vsx_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_vsx_f32 #define xldexp nsimd_sleef_ldexp_vsx_f64 #define xldexpf nsimd_sleef_ldexp_vsx_f32 #define xilogb nsimd_sleef_ilogb_vsx_f64 #define xilogbf nsimd_sleef_ilogb_vsx_f32 #define xfma nsimd_sleef_fma_vsx_f64 #define xfmaf nsimd_sleef_fma_vsx_f32 #define xsqrt nsimd_sleef_sqrt_vsx_f64 #define xsqrtf nsimd_sleef_sqrt_vsx_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_vsx_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_vsx_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_vsx_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_vsx_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_vsx_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_vsx_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_vsx_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_vsx_f32 #define xfabs nsimd_sleef_fabs_vsx_f64 #define xfabsf nsimd_sleef_fabs_vsx_f32 #define xcopysign nsimd_sleef_copysign_vsx_f64 #define xcopysignf nsimd_sleef_copysign_vsx_f32 #define xfmax nsimd_sleef_fmax_vsx_f64 #define xfmaxf nsimd_sleef_fmax_vsx_f32 #define xfmin nsimd_sleef_fmin_vsx_f64 #define xfminf nsimd_sleef_fmin_vsx_f32 #define xfdim nsimd_sleef_fdim_vsx_f64 #define xfdimf nsimd_sleef_fdim_vsx_f32 #define xtrunc nsimd_sleef_trunc_vsx_f64 #define xtruncf nsimd_sleef_trunc_vsx_f32 #define xfloor nsimd_sleef_floor_vsx_f64 #define xfloorf nsimd_sleef_floor_vsx_f32 #define xceil nsimd_sleef_ceil_vsx_f64 #define xceilf nsimd_sleef_ceil_vsx_f32 #define xround nsimd_sleef_round_vsx_f64 #define xroundf nsimd_sleef_round_vsx_f32 #define xrint nsimd_sleef_rint_vsx_f64 #define xrintf nsimd_sleef_rint_vsx_f32 #define xnextafter nsimd_sleef_nextafter_vsx_f64 #define xnextafterf nsimd_sleef_nextafter_vsx_f32 #define xfrfrexp nsimd_sleef_frfrexp_vsx_f64 #define xfrfrexpf 
nsimd_sleef_frfrexp_vsx_f32 #define xexpfrexp nsimd_sleef_expfrexp_vsx_f64 #define xexpfrexpf nsimd_sleef_expfrexp_vsx_f32 #define xfmod nsimd_sleef_fmod_vsx_f64 #define xfmodf nsimd_sleef_fmod_vsx_f32 #define xremainder nsimd_sleef_remainder_vsx_f64 #define xremainderf nsimd_sleef_remainder_vsx_f32 #define xmodf nsimd_sleef_modf_vsx_f64 #define xmodff nsimd_sleef_modf_vsx_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_vsx_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_vsx_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_vsx_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_vsx_f32 #define xerf_u1 nsimd_sleef_erf_u10_vsx_f64 #define xerff_u1 nsimd_sleef_erf_u10_vsx_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_vsx_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_vsx_f32 #define xgetInt nsimd_sleef_getInt_vsx_f64 #define xgetIntf nsimd_sleef_getInt_vsx_f32 #define xgetPtr nsimd_sleef_getPtr_vsx_f64 #define xgetPtrf nsimd_sleef_getPtr_vsx_f32 #endif #define rempi nsimd_sleef_rempi_vsx #define rempif nsimd_sleef_rempif_vsx #define rempisub nsimd_sleef_rempisub_vsx #define rempisubf nsimd_sleef_rempisubf_vsx #define gammak nsimd_gammak_vsx #define gammafk nsimd_gammafk_vsx #endif #endif ================================================ FILE: src/sleefdp.c ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) // Always use -ffp-contract=off option to compile SLEEF. 
/* NOTE(review): the operands of the bare #include directives below were
 * lost when this file was flattened (angle-bracketed header names were
 * stripped by the extraction).  The code visibly needs at least <stdio.h>
 * (fprintf/fflush), <stdint.h> (int64_t/INT64_C) and <limits.h> (INT_MAX);
 * restore the exact list from upstream SLEEF before compiling. */
#include
#include
#include
#include
#include

#ifndef ENABLE_BUILTIN_MATH
#include
#define SQRT sqrt
#else
#define SQRT __builtin_sqrt
#endif

#include "misc.h"

/* Payne-Hanek argument-reduction table, defined in a separate TU. */
extern const double Sleef_rempitabdp[];

#ifdef DORENAME
#include "rename.h"
#endif

/* SLEEF requires exact mul-then-add sequences; forbid FP contraction on
 * MSVC (other compilers are handled by building with -ffp-contract=off,
 * per the header comment of this file). */
#if (defined(_MSC_VER))
#pragma fp_contract (off)
#endif

#define MLA mla
#define C2V(x) (x)
#include "estrin.h"

/* Reinterpret the bits of a double as a 64-bit integer (union type-pun). */
static INLINE CONST int64_t doubleToRawLongBits(double d) {
  union {
    double f;
    int64_t i;
  } tmp;
  tmp.f = d;
  return tmp.i;
}

/* Inverse of doubleToRawLongBits. */
static INLINE CONST double longBitsToDouble(int64_t i) {
  union {
    double f;
    int64_t i;
  } tmp;
  tmp.i = i;
  return tmp.f;
}

/* |x| by clearing the sign bit. */
static INLINE CONST double fabsk(double x) {
  return longBitsToDouble(INT64_C(0x7fffffffffffffff) & doubleToRawLongBits(x));
}

/* x with its sign flipped iff y is negative (bitwise x * sign(y)). */
static INLINE CONST double mulsign(double x, double y) {
  return longBitsToDouble(doubleToRawLongBits(x) ^
                          (doubleToRawLongBits(y) & (INT64_C(1) << 63)));
}

/* Bitwise copysign: magnitude of x, sign of y. */
static INLINE CONST double copysignk(double x, double y) {
  return longBitsToDouble((doubleToRawLongBits(x) & ~(INT64_C(1) << 63)) ^
                          (doubleToRawLongBits(y) & (INT64_C(1) << 63)));
}

/* +1.0 or -1.0 following the sign bit of d. */
static INLINE CONST double sign(double d) { return mulsign(1, d); }

/* Multiply-add; stays a separate mul and add (contraction is disabled). */
static INLINE CONST double mla(double x, double y, double z) { return x * y + z; }

/* Round to nearest, ties away from zero; result must fit in an int. */
static INLINE CONST double rintk(double x) {
  return x < 0 ? (int)(x - 0.5) : (int)(x + 0.5);
}

/* Like ceil except that a non-negative integral x yields x + 1 (for x < 0
 * it is plain truncation).  Callers mask the result (e.g. ceilk(u) & ~1),
 * so this is intentional -- do not "fix" it to a textbook ceil. */
static INLINE CONST int ceilk(double x) { return (int)x + (x < 0 ? 0 : 1); }

/* Truncate toward zero through an int cast (valid only in int range). */
static INLINE CONST double trunck(double x) { return (double)(int)x; }

/* Branchy min/max without IEEE NaN special-casing. */
static INLINE CONST double fmink(double x, double y) { return x < y ? x : y; }
static INLINE CONST double fmaxk(double x, double y) { return x > y ? x : y; }

/* Quiet classification helpers that avoid libm calls. */
static INLINE CONST int xisnan(double x) { return x != x; }
static INLINE CONST int xisinf(double x) { return x == SLEEF_INFINITY || x == -SLEEF_INFINITY; }
static INLINE CONST int xisminf(double x) { return x == -SLEEF_INFINITY; }
static INLINE CONST int xispinf(double x) { return x == SLEEF_INFINITY; }
static INLINE CONST int xisnegzero(double x) { return doubleToRawLongBits(x) == doubleToRawLongBits(-0.0); }
static INLINE CONST int xisnumber(double x) { return !xisinf(x) && !xisnan(x); }

/* Integer test: d is reduced modulo 2^31 so the int cast cannot overflow;
 * every value with |d| >= 2^53 is integral by construction. */
static INLINE CONST int xisint(double d) {
  double x = d - (double)(INT64_C(1) << 31) * (int)(d * (1.0 / (INT64_C(1) << 31)));
  return (x == (int)x) || (fabsk(d) >= (double)(INT64_C(1) << 53));
}

/* Odd-integer test; only meaningful below 2^53 where parity is representable. */
static INLINE CONST int xisodd(double d) {
  double x = d - (double)(INT64_C(1) << 31) * (int)(d * (1.0 / (INT64_C(1) << 31)));
  return (1 & (int)x) != 0 && fabsk(d) < (double)(INT64_C(1) << 53);
}

/* 2^q assembled directly in the exponent field; q must keep the biased
 * exponent inside [0, 0x7ff]. */
static INLINE CONST double pow2i(int q) {
  return longBitsToDouble(((int64_t)(q + 0x3ff)) << 52);
}

/* Full-range ldexp: the exponent is applied as u^4 times a remainder
 * factor, with the biased exponent clamped to [0, 0x7ff], so overflow
 * and gradual underflow behave. */
static INLINE CONST double ldexpk(double x, int q) {
  double u;
  int m;
  m = q >> 31;
  m = (((m + q) >> 9) - m) << 7;
  q = q - (m << 2);
  m += 0x3ff;
  m = m < 0 ? 0 : m;
  m = m > 0x7ff ? 0x7ff : m;
  u = longBitsToDouble(((int64_t)m) << 52);
  x = x * u * u * u * u;
  u = longBitsToDouble(((int64_t)(q + 0x3ff)) << 52);
  return x * u;
}

static INLINE CONST double ldexp2k(double d, int e) { // faster than ldexpk, short reach
  return d * pow2i(e >> 1) * pow2i(e - (e >> 1));
}

static INLINE CONST double ldexp3k(double d, int e) { // very fast, no denormal
  return longBitsToDouble(doubleToRawLongBits(d) + (((int64_t)e) << 52));
}

/* Public ldexp: clamp the exponent, then scale as pow2i(e1) * pow2i(e0)^4
 * so denormal results are still produced correctly. */
EXPORT CONST double xldexp(double x, int exp) {
  if (exp > 2100) exp = 2100;
  if (exp < -2100) exp = -2100;

  int e0 = exp >> 2;
  if (exp < 0) e0++;
  if (-100 < exp && exp < 100) e0 = 0;
  int e1 = exp - (e0 << 2);

  double p = pow2i(e0);
  double ret = x * pow2i(e1) * p * p * p * p;

  return ret;
}

/* ilogb on positive inputs; values below 2^-300 (4.909...E-91) are
 * pre-scaled by 2^300 (2.037...E90) so denormals report their true
 * exponent -- see the matching "q - (300 + 0x3ff)" adjustment. */
static INLINE CONST int ilogbk(double d) {
  int m = d < 4.9090934652977266E-91;
  d = m ?
2.037035976334486E90 * d : d;
  int q = (doubleToRawLongBits(d) >> 52) & 0x7ff;
  q = m ? q - (300 + 0x03ff) : q - 0x03ff;
  return q;
}

// ilogb2k is similar to ilogbk, but the argument has to be a
// normalized FP value.
static INLINE CONST int ilogb2k(double d) {
  return ((doubleToRawLongBits(d) >> 52) & 0x7ff) - 0x3ff;
}

/* Public ilogb with the C99 special cases (zero, NaN, infinity). */
EXPORT CONST int xilogb(double d) {
  int e = ilogbk(fabsk(d));
  e = d == 0.0 ? SLEEF_FP_ILOGB0 : e;
  e = xisnan(d) ? SLEEF_FP_ILOGBNAN : e;
  e = xisinf(d) ? INT_MAX : e;
  return e;
}

//

/* NOTE(review): in the flattened dump this guard appeared fused as
 * "// #ifndef NDEBUG"; the "//" is upstream SLEEF's bare separator line
 * and the directive must be on its own line, otherwise the #endif below
 * is unmatched -- restored accordingly. */
#ifndef NDEBUG
/* Debug-only helper: nonzero when x is Inf or NaN (used by the ddadd
 * precondition checks below). */
static int checkfp(double x) {
  if (xisinf(x) || xisnan(x)) return 1;
  return 0;
}
#endif

/* Head of d with the low 27 mantissa bits cleared: Dekker splitting so
 * partial products in the dd kernels are exact. */
static INLINE CONST double upper(double d) {
  return longBitsToDouble(doubleToRawLongBits(d) & INT64_C(0xfffffffff8000000));
}

/* Build a double-double value (x = head, y = tail). */
static INLINE CONST Sleef_double2 dd(double h, double l) {
  Sleef_double2 ret;
  ret.x = h;
  ret.y = l;
  return ret;
}

/* Renormalize so the tail is small relative to the head. */
static INLINE CONST Sleef_double2 ddnormalize_d2_d2(Sleef_double2 t) {
  Sleef_double2 s;
  s.x = t.x + t.y;
  s.y = t.x - s.x + t.y;
  return s;
}

/* Scale both limbs; exact when s is a power of two (callers use 0.5, 2). */
static INLINE CONST Sleef_double2 ddscale_d2_d2_d(Sleef_double2 d, double s) {
  Sleef_double2 r;
  r.x = d.x * s;
  r.y = d.y * s;
  return r;
}

static INLINE CONST Sleef_double2 ddneg_d2_d2(Sleef_double2 d) {
  Sleef_double2 r;
  r.x = -d.x;
  r.y = -d.y;
  return r;
}

/* |x| for a dd: both limbs flip together with the head's sign. */
static INLINE CONST Sleef_double2 ddabs_d2_d2(Sleef_double2 x) {
  return dd(x.x < 0 ? -x.x : x.x,
            x.x < 0 ? -x.y : x.y);
}

/*
 * ddadd and ddadd2 are functions for double-double addition.  ddadd
 * is simpler and faster than ddadd2, but it requires the absolute
 * value of first argument to be larger than the second argument.  The
 * exact condition that should be met is checked if NDEBUG macro is
 * not defined.
 *
 * Please note that if the results won't be used, it is no problem to
 * feed arguments that do not meet this condition.  You will see
 * warning messages if you turn off NDEBUG macro and run tester2, but
 * this is normal.
 *
 * Please see :
 * Jonathan Richard Shewchuk, Adaptive Precision Floating-Point
 * Arithmetic and Fast Robust Geometric Predicates, Discrete &
 * Computational Geometry 18:305-363, 1997.
 */

/* Fast (Dekker) two-sum of two doubles. */
static INLINE CONST Sleef_double2 ddadd_d2_d_d(double x, double y) {
  // |x| >= |y|
  Sleef_double2 r;

#ifndef NDEBUG
  if (!(checkfp(x) || checkfp(y) || fabsk(x) >= fabsk(y) ||
        (fabsk(x+y) <= fabsk(x) && fabsk(x+y) <= fabsk(y)))) {
    fprintf(stderr, "[ddadd_d2_d_d : %g, %g]\n", x, y);
    fflush(stderr);
  }
#endif

  r.x = x + y;
  r.y = x - r.x + y;
  return r;
}

/* Robust (Knuth) two-sum: no ordering requirement on |x|, |y|. */
static INLINE CONST Sleef_double2 ddadd2_d2_d_d(double x, double y) {
  Sleef_double2 r;
  r.x = x + y;
  double v = r.x - x;
  r.y = (x - (r.x - v)) + (y - v);
  return r;
}

/* dd + double, fast path. */
static INLINE CONST Sleef_double2 ddadd_d2_d2_d(Sleef_double2 x, double y) {
  // |x| >= |y|
  Sleef_double2 r;

#ifndef NDEBUG
  if (!(checkfp(x.x) || checkfp(y) || fabsk(x.x) >= fabsk(y) ||
        (fabsk(x.x+y) <= fabsk(x.x) && fabsk(x.x+y) <= fabsk(y)))) {
    fprintf(stderr, "[ddadd_d2_d2_d : %g %g]\n", x.x, y);
    fflush(stderr);
  }
#endif

  r.x = x.x + y;
  r.y = x.x - r.x + y + x.y;
  return r;
}

/* dd + double, robust path. */
static INLINE CONST Sleef_double2 ddadd2_d2_d2_d(Sleef_double2 x, double y) {
  Sleef_double2 r;
  r.x = x.x + y;
  double v = r.x - x.x;
  r.y = (x.x - (r.x - v)) + (y - v);
  r.y += x.y;
  return r;
}

/* double + dd, fast path. */
static INLINE CONST Sleef_double2 ddadd_d2_d_d2(double x, Sleef_double2 y) {
  // |x| >= |y|
  Sleef_double2 r;

#ifndef NDEBUG
  if (!(checkfp(x) || checkfp(y.x) || fabsk(x) >= fabsk(y.x) ||
        (fabsk(x+y.x) <= fabsk(x) && fabsk(x+y.x) <= fabsk(y.x)))) {
    fprintf(stderr, "[ddadd_d2_d_d2 : %g %g]\n", x, y.x);
    fflush(stderr);
  }
#endif

  r.x = x + y.x;
  r.y = x - r.x + y.x + y.y;
  return r;
}

/* double + dd, robust path. */
static INLINE CONST Sleef_double2 ddadd2_d2_d_d2(double x, Sleef_double2 y) {
  Sleef_double2 r;
  r.x = x + y.x;
  double v = r.x - x;
  r.y = (x - (r.x - v)) + (y.x - v) + y.y;
  return r;
}

/* double + dd collapsed to a plain double, tail added first. */
static INLINE CONST double ddadd2_d_d_d2(double x, Sleef_double2 y) {
  return y.y + y.x + x;
}

static INLINE CONST Sleef_double2 ddadd_d2_d2_d2(Sleef_double2 x,
Sleef_double2 y) { // |x| >= |y|
  Sleef_double2 r;

#ifndef NDEBUG
  if (!(x.x == 0 || checkfp(x.x) || checkfp(y.x) || fabsk(x.x) >= fabsk(y.x) ||
        (fabsk(x.x+y.x) <= fabsk(x.x) && fabsk(x.x+y.x) <= fabsk(y.x)))) {
    fprintf(stderr, "[ddadd_d2_d2_d2 : %g %g]\n", x.x, y.x);
    fflush(stderr);
  }
#endif

  r.x = x.x + y.x;
  r.y = x.x - r.x + y.x + x.y + y.y;
  return r;
}

/* dd + dd, robust path (Knuth two-sum on the heads). */
static INLINE CONST Sleef_double2 ddadd2_d2_d2_d2(Sleef_double2 x, Sleef_double2 y) {
  Sleef_double2 r;
  r.x = x.x + y.x;
  double v = r.x - x.x;
  r.y = (x.x - (r.x - v)) + (y.x - v);
  r.y += x.y + y.y;
  return r;
}

/* dd - dd, fast path: requires |x| >= |y| (checked when !NDEBUG). */
static INLINE CONST Sleef_double2 ddsub_d2_d2_d2(Sleef_double2 x, Sleef_double2 y) {
  // |x| >= |y|
  Sleef_double2 r;

#ifndef NDEBUG
  if (!(checkfp(x.x) || checkfp(y.x) || fabsk(x.x) >= fabsk(y.x) ||
        (fabsk(x.x-y.x) <= fabsk(x.x) && fabsk(x.x-y.x) <= fabsk(y.x)))) {
    fprintf(stderr, "[ddsub_d2_d2_d2 : %g %g]\n", x.x, y.x);
    fflush(stderr);
  }
#endif

  r.x = x.x - y.x;
  r.y = x.x - r.x - y.x + x.y - y.y;
  return r;
}

/* dd / dd: one correction step around t = 1/d.x; limbs are split with
 * upper() so every partial product is exact. */
static INLINE CONST Sleef_double2 dddiv_d2_d2_d2(Sleef_double2 n, Sleef_double2 d) {
  double t = 1.0 / d.x;
  double dh  = upper(d.x), dl  = d.x - dh;
  double th  = upper(t  ), tl  = t   - th;
  double nhh = upper(n.x), nhl = n.x - nhh;

  Sleef_double2 q;
  q.x = n.x * t;
  double u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl +
    q.x * (1 - dh * th - dh * tl - dl * th - dl * tl);
  q.y = t * (n.y - q.x * d.y) + u;
  return q;
}

/* Exact product of two doubles as a dd (Dekker splitting). */
static INLINE CONST Sleef_double2 ddmul_d2_d_d(double x, double y) {
  double xh = upper(x), xl = x - xh;
  double yh = upper(y), yl = y - yh;
  Sleef_double2 r;
  r.x = x * y;
  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl;
  return r;
}

/* dd * double. */
static INLINE CONST Sleef_double2 ddmul_d2_d2_d(Sleef_double2 x, double y) {
  double xh = upper(x.x), xl = x.x - xh;
  double yh = upper(y  ), yl = y   - yh;
  Sleef_double2 r;
  r.x = x.x * y;
  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y;
  return r;
}

/* dd * dd. */
static INLINE CONST Sleef_double2 ddmul_d2_d2_d2(Sleef_double2 x, Sleef_double2 y) {
  double xh = upper(x.x), xl = x.x - xh;
  double yh = upper(y.x), yl = y.x - yh;
  Sleef_double2 r;
  r.x = x.x * y.x;
  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x;
  return r;
}

/* dd * dd collapsed to one double; terms are summed smallest-first. */
static INLINE CONST double ddmul_d_d2_d2(Sleef_double2 x, Sleef_double2 y) {
  double xh = upper(x.x), xl = x.x - xh;
  double yh = upper(y.x), yl = y.x - yh;
  return x.y * yh + xh * y.y + xl * yl + xh * yl + xl * yh + xh * yh;
}

/* Squaring, cheaper than ddmul_d2_d2_d2(x, x). */
static INLINE CONST Sleef_double2 ddsqu_d2_d2(Sleef_double2 x) {
  double xh = upper(x.x), xl = x.x - xh;
  Sleef_double2 r;
  r.x = x.x * x.x;
  r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y);
  return r;
}

/* Squaring collapsed to one double. */
static INLINE CONST double ddsqu_d_d2(Sleef_double2 x) {
  double xh = upper(x.x), xl = x.x - xh;
  return xh * x.y + xh * x.y + xl * xl + (xh * xl + xh * xl) + xh * xh;
}

/* 1/d as a dd with one correction term. */
static INLINE CONST Sleef_double2 ddrec_d2_d(double d) {
  double t = 1.0 / d;
  double dh = upper(d), dl = d - dh;
  double th = upper(t), tl = t - th;
  Sleef_double2 q;
  q.x = t;
  q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl);
  return q;
}

/* 1/d for a dd argument; the tail d.y enters the correction term. */
static INLINE CONST Sleef_double2 ddrec_d2_d2(Sleef_double2 d) {
  double t = 1.0 / d.x;
  double dh = upper(d.x), dl = d.x - dh;
  double th = upper(t  ), tl = t   - th;
  Sleef_double2 q;
  q.x = t;
  q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl - d.y * t);
  return q;
}

/* sqrt of a dd: refine the scalar SQRT with one Newton step carried out
 * in dd arithmetic, i.e. 0.5 * (d + t^2) / t. */
static INLINE CONST Sleef_double2 ddsqrt_d2_d2(Sleef_double2 d) {
  double t = SQRT(d.x + d.y);
  return ddscale_d2_d2_d(ddmul_d2_d2_d2(ddadd2_d2_d2_d2(d, ddmul_d2_d_d(t, t)),
                                        ddrec_d2_d(t)),
                         0.5);
}

/* sqrt of a double returned as a dd (same Newton refinement). */
static INLINE CONST Sleef_double2 ddsqrt_d2_d(double d) {
  double t = SQRT(d);
  return ddscale_d2_d2_d(ddmul_d2_d2_d2(ddadd2_d2_d_d2(d, ddmul_d2_d_d(t, t)),
                                        ddrec_d2_d(t)),
                         0.5);
}

//

/* Core atan2 kernel (~3.5 ulp path).  Reduces (y, x) to the first octant
 * with q recording the quadrant adjustments, then evaluates a degree-19
 * polynomial in s = y/x via the estrin.h POLY19 helper. */
static INLINE CONST double atan2k(double y, double x) {
  double s, t, u;
  int q = 0;

  if (x < 0) { x = -x; q = -2; }
  if (y > x) { t = x; x = y; y = -t; q += 1; }

  s = y / x;
  t = s * s;

  double t2 = t * t, t4 = t2 * t2, t8 = t4 * t4, t16 = t8 * t8;
  u = POLY19(t, t2, t4, t8, t16,
             -1.88796008463073496563746e-05,
             0.000209850076645816976906797,
-0.00110611831486672482563471,
             0.00370026744188713119232403,
             -0.00889896195887655491740809,
             0.016599329773529201970117,
             -0.0254517624932312641616861,
             0.0337852580001353069993897,
             -0.0407629191276836500001934,
             0.0466667150077840625632675,
             -0.0523674852303482457616113,
             0.0587666392926673580854313,
             -0.0666573579361080525984562,
             0.0769219538311769618355029,
             -0.090908995008245008229153,
             0.111111105648261418443745,
             -0.14285714266771329383765,
             0.199999999996591265594148,
             -0.333333333333311110369124);

  t = u * t * s + s;
  t = q * (M_PI/2) + t;

  return t;
}

/* atan2, ~3.5 ulp.  The kernel runs on |y|; Inf/zero arguments are
 * patched afterwards and the sign of y is re-applied last. */
EXPORT CONST double xatan2(double y, double x) {
  double r = atan2k(fabsk(y), x);

  r = mulsign(r, x);
  if (xisinf(x) || x == 0) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI /2)) : 0);
  if (xisinf(y)          ) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI*1/4)) : 0);
  if (             y == 0) r = (sign(x) == -1 ? M_PI : 0);

  return xisnan(x) || xisnan(y) ? SLEEF_NAN : mulsign(r, y);
}

/* asin, ~3.5 ulp: |d| < 0.5 evaluates the polynomial at d directly,
 * otherwise the identity asin(d) = pi/2 - 2*asin(sqrt((1-|d|)/2)) is
 * applied (same 12 coefficients in both branches). */
EXPORT CONST double xasin(double d) {
  int o = fabsk(d) < 0.5;
  double x2 = o ? (d*d) : ((1-fabsk(d))*0.5), x = o ? fabsk(d) : SQRT(x2), u;

  double x4 = x2 * x2, x8 = x4 * x4, x16 = x8 * x8;
  u = POLY12(x2, x4, x8, x16,
             +0.3161587650653934628e-1,
             -0.1581918243329996643e-1,
             +0.1929045477267910674e-1,
             +0.6606077476277170610e-2,
             +0.1215360525577377331e-1,
             +0.1388715184501609218e-1,
             +0.1735956991223614604e-1,
             +0.2237176181932048341e-1,
             +0.3038195928038132237e-1,
             +0.4464285681377102438e-1,
             +0.7500000000378581611e-1,
             +0.1666666666666497543e+0);

  u = mla(u, x * x2, x);
  double r = o ? u : (M_PI/2 - 2*u);
  r = mulsign(r, d);

  return r;
}

/* acos, ~3.5 ulp: same reduction and coefficients as xasin; for the
 * |d| >= 0.5, d < 0 branch pi is folded in through a double-double
 * subtraction to preserve accuracy near the ends of the domain. */
EXPORT CONST double xacos(double d) {
  int o = fabsk(d) < 0.5;
  double x2 = o ? (d*d) : ((1-fabsk(d))*0.5), u;
  double x = o ? fabsk(d) : SQRT(x2);
  x = fabsk(d) == 1.0 ? 0 : x;

  double x4 = x2 * x2, x8 = x4 * x4, x16 = x8 * x8;
  u = POLY12(x2, x4, x8, x16,
             +0.3161587650653934628e-1,
             -0.1581918243329996643e-1,
             +0.1929045477267910674e-1,
             +0.6606077476277170610e-2,
             +0.1215360525577377331e-1,
             +0.1388715184501609218e-1,
             +0.1735956991223614604e-1,
             +0.2237176181932048341e-1,
             +0.3038195928038132237e-1,
             +0.4464285681377102438e-1,
             +0.7500000000378581611e-1,
             +0.1666666666666497543e+0);

  u *= x * x2;

  double y = 3.1415926535897932/2 - (mulsign(x, d) + mulsign(u, d));
  x += u;
  double r = o ? y : (x*2);
  if (!o && d < 0) r = ddadd_d2_d2_d(dd(3.141592653589793116, 1.2246467991473532072e-16), -r).x;

  return r;
}

/* atan, ~3.5 ulp: reduce the argument to [0, 1] via 1/s with the sign
 * and reciprocal flags packed into q, then undo them at the end. */
EXPORT CONST double xatan(double s) {
  double t, u;
  int q = 0;

  if (sign(s) == -1) { s = -s; q = 2; }
  if (s > 1) { s = 1.0 / s; q |= 1; }

  t = s * s;

  double t2 = t * t, t4 = t2 * t2, t8 = t4 * t4, t16 = t8 * t8;
  u = POLY19(t, t2, t4, t8, t16,
             -1.88796008463073496563746e-05,
             0.000209850076645816976906797,
             -0.00110611831486672482563471,
             0.00370026744188713119232403,
             -0.00889896195887655491740809,
             0.016599329773529201970117,
             -0.0254517624932312641616861,
             0.0337852580001353069993897,
             -0.0407629191276836500001934,
             0.0466667150077840625632675,
             -0.0523674852303482457616113,
             0.0587666392926673580854313,
             -0.0666573579361080525984562,
             0.0769219538311769618355029,
             -0.090908995008245008229153,
             0.111111105648261418443745,
             -0.14285714266771329383765,
             0.199999999996591265594148,
             -0.333333333333311110369124);

  t = s + s * (t * u);

  if ((q & 1) != 0) t = 1.570796326794896557998982 - t;
  if ((q & 2) != 0) t = -t;

  return t;
}

/* 1-ulp atan2 kernel: same octant reduction as atan2k, but s and t are
 * double-double values and the last four coefficients are applied in dd
 * arithmetic after this POLY16 prefix. */
static Sleef_double2 atan2k_u1(Sleef_double2 y, Sleef_double2 x) {
  double u;
  Sleef_double2 s, t;
  int q = 0;

  if (x.x < 0) { x.x = -x.x; x.y = -x.y; q = -2; }
  if (y.x > x.x) { t = x; x = y; y.x = -t.x; y.y = -t.y; q += 1; }

  s = dddiv_d2_d2_d2(y, x);
  t = ddsqu_d2_d2(s);
  t = ddnormalize_d2_d2(t);

  double t2 = t.x * t.x, t4 = t2 * t2, t8 = t4 * t4, t16 = t8 * t8;
  u = POLY16(t.x, t2, t4, t8,
             1.06298484191448746607415e-05,
             -0.000125620649967286867384336,
0.00070557664296393412389774, -0.00251865614498713360352999, 0.00646262899036991172313504, -0.0128281333663399031014274, 0.0208024799924145797902497, -0.0289002344784740315686289, 0.0359785005035104590853656, -0.041848579703592507506027, 0.0470843011653283988193763, -0.0524914210588448421068719, 0.0587946590969581003860434, -0.0666620884778795497194182, 0.0769225330296203768654095, -0.0909090442773387574781907); u = mla(u, t.x, 0.111111108376896236538123); u = mla(u, t.x, -0.142857142756268568062339); u = mla(u, t.x, 0.199999999997977351284817); u = mla(u, t.x, -0.333333333333317605173818); t = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddmul_d2_d2_d2(s, t), u)); if (fabsk(s.x) < 1e-200) t = s; t = ddadd2_d2_d2_d2(ddmul_d2_d2_d(dd(1.570796326794896557998982, 6.12323399573676603586882e-17), q), t); return t; } EXPORT CONST double xatan2_u1(double y, double x) { if (fabsk(x) < 5.5626846462680083984e-309) { y *= (UINT64_C(1) << 53); x *= (UINT64_C(1) << 53); } // nexttoward((1.0 / DBL_MAX), 1) Sleef_double2 d = atan2k_u1(dd(fabsk(y), 0), dd(x, 0)); double r = d.x + d.y; r = mulsign(r, x); if (xisinf(x) || x == 0) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI /2)) : 0); if (xisinf(y) ) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI*1/4)) : 0); if ( y == 0) r = (sign(x) == -1 ? M_PI : 0); return xisnan(x) || xisnan(y) ? SLEEF_NAN : mulsign(r, y); } EXPORT CONST double xasin_u1(double d) { int o = fabsk(d) < 0.5; double x2 = o ? (d*d) : ((1-fabsk(d))*0.5), u; Sleef_double2 x = o ? dd(fabsk(d), 0) : ddsqrt_d2_d(x2); x = fabsk(d) == 1.0 ? 
dd(0, 0) : x; double x4 = x2 * x2, x8 = x4 * x4, x16 = x8 * x8; u = POLY12(x2, x4, x8, x16, +0.3161587650653934628e-1, -0.1581918243329996643e-1, +0.1929045477267910674e-1, +0.6606077476277170610e-2, +0.1215360525577377331e-1, +0.1388715184501609218e-1, +0.1735956991223614604e-1, +0.2237176181932048341e-1, +0.3038195928038132237e-1, +0.4464285681377102438e-1, +0.7500000000378581611e-1, +0.1666666666666497543e+0); u *= x2 * x.x; Sleef_double2 y = ddadd_d2_d2_d(ddsub_d2_d2_d2(dd(3.141592653589793116/4, 1.2246467991473532072e-16/4), x), -u); double r = o ? (u + x.x) : ((y.x + y.y)*2); r = mulsign(r, d); return r; } EXPORT CONST double xacos_u1(double d) { int o = fabsk(d) < 0.5; double x2 = o ? (d*d) : ((1-fabsk(d))*0.5), u; Sleef_double2 x = o ? dd(fabsk(d), 0) : ddsqrt_d2_d(x2), w; x = fabsk(d) == 1.0 ? dd(0, 0) : x; double x4 = x2 * x2, x8 = x4 * x4, x16 = x8 * x8; u = POLY12(x2, x4, x8, x16, +0.3161587650653934628e-1, -0.1581918243329996643e-1, +0.1929045477267910674e-1, +0.6606077476277170610e-2, +0.1215360525577377331e-1, +0.1388715184501609218e-1, +0.1735956991223614604e-1, +0.2237176181932048341e-1, +0.3038195928038132237e-1, +0.4464285681377102438e-1, +0.7500000000378581611e-1, +0.1666666666666497543e+0); u *= x.x * x2; Sleef_double2 y = ddsub_d2_d2_d2(dd(3.141592653589793116/2, 1.2246467991473532072e-16/2), ddadd_d2_d_d(mulsign(x.x, d), mulsign(u, d))); x = ddadd_d2_d2_d(x, u); y = o ? 
y : ddscale_d2_d2_d(x, 2); if (!o && d < 0) y = ddsub_d2_d2_d2(dd(3.141592653589793116, 1.2246467991473532072e-16), y); return y.x + y.y; } EXPORT CONST double xatan_u1(double d) { Sleef_double2 d2 = atan2k_u1(dd(fabsk(d), 0), dd(1, 0)); double r = d2.x + d2.y; if (xisinf(d)) r = 1.570796326794896557998982; return mulsign(r, d); } typedef struct { double d; int32_t i; } di_t; typedef struct { Sleef_double2 dd; int32_t i; } ddi_t; static INLINE CONST double orsign(double x, double y) { return longBitsToDouble(doubleToRawLongBits(x) | (doubleToRawLongBits(y) & (INT64_C(1) << 63))); } static CONST di_t rempisub(double x) { // This function is equivalent to : // di_t ret = { x - rint(4 * x) * 0.25, (int32_t)(rint(4 * x) - rint(x) * 4) }; di_t ret; double c = mulsign(INT64_C(1) << 52, x); double rint4x = fabsk(4*x) > INT64_C(1) << 52 ? (4*x) : orsign(mla(4, x, c) - c, x); double rintx = fabsk( x) > INT64_C(1) << 52 ? x : orsign(x + c - c , x); ret.d = mla(-0.25, rint4x, x); ret.i = mla(-4 , rintx , rint4x); return ret; } // Payne-Hanek like argument reduction static CONST ddi_t rempi(double a) { Sleef_double2 x, y, z; di_t di; double t; int ex = ilogb2k(a) - 55, q = ex > (700-55) ? -64 : 0; a = ldexp3k(a, q); if (ex < 0) ex = 0; ex *= 4; x = ddmul_d2_d_d(a, Sleef_rempitabdp[ex]); di = rempisub(x.x); q = di.i; x.x = di.d; x = ddnormalize_d2_d2(x); y = ddmul_d2_d_d(a, Sleef_rempitabdp[ex+1]); x = ddadd2_d2_d2_d2(x, y); di = rempisub(x.x); q += di.i; x.x = di.d; x = ddnormalize_d2_d2(x); y = ddmul_d2_d2_d(dd(Sleef_rempitabdp[ex+2], Sleef_rempitabdp[ex+3]), a); x = ddadd2_d2_d2_d2(x, y); x = ddnormalize_d2_d2(x); x = ddmul_d2_d2_d2(x, dd(3.141592653589793116*2, 1.2246467991473532072e-16*2)); ddi_t ret = { fabsk(a) < 0.7 ? 
dd(a, 0) : x, q }; return ret; } EXPORT CONST double xsin(double d) { double u, s, t = d; int ql; if (fabsk(d) < TRIGRANGEMAX2) { ql = rintk(d * M_1_PI); d = mla(ql, -PI_A2, d); d = mla(ql, -PI_B2, d); } else if (fabsk(d) < TRIGRANGEMAX) { double dqh = trunck(d * (M_1_PI / (1 << 24))) * (double)(1 << 24); ql = rintk(mla(d, M_1_PI, -dqh)); d = mla(dqh, -PI_A, d); d = mla( ql, -PI_A, d); d = mla(dqh, -PI_B, d); d = mla( ql, -PI_B, d); d = mla(dqh, -PI_C, d); d = mla( ql, -PI_C, d); d = mla(dqh + ql, -PI_D, d); } else { ddi_t ddi = rempi(t); ql = ((ddi.i & 3) * 2 + (ddi.dd.x > 0) + 1) >> 2; if ((ddi.i & 1) != 0) { ddi.dd = ddadd2_d2_d2_d2(ddi.dd, dd(mulsign(3.141592653589793116*-0.5, ddi.dd.x), mulsign(1.2246467991473532072e-16*-0.5, ddi.dd.x))); } d = ddi.dd.x + ddi.dd.y; if (xisinf(t) || xisnan(t)) d = SLEEF_NAN; } s = d * d; if ((ql & 1) != 0) d = -d; double s2 = s * s, s4 = s2 * s2; u = POLY8(s, s2, s4, -7.97255955009037868891952e-18, 2.81009972710863200091251e-15, -7.64712219118158833288484e-13, 1.60590430605664501629054e-10, -2.50521083763502045810755e-08, 2.75573192239198747630416e-06, -0.000198412698412696162806809, 0.00833333333333332974823815); u = mla(u, s, -0.166666666666666657414808); u = mla(s, u * d, d); if (xisnegzero(t)) u = t; return u; } EXPORT CONST double xsin_u1(double d) { double u; Sleef_double2 s, t, x; int ql; if (fabsk(d) < TRIGRANGEMAX2) { ql = rintk(d * M_1_PI); u = mla(ql, -PI_A2, d); s = ddadd_d2_d_d (u, ql * -PI_B2); } else if (fabsk(d) < TRIGRANGEMAX) { const double dqh = trunck(d * (M_1_PI / (1 << 24))) * (double)(1 << 24); ql = rintk(mla(d, M_1_PI, -dqh)); u = mla(dqh, -PI_A, d); s = ddadd_d2_d_d (u, ql * -PI_A); s = ddadd2_d2_d2_d(s, dqh * -PI_B); s = ddadd2_d2_d2_d(s, ql * -PI_B); s = ddadd2_d2_d2_d(s, dqh * -PI_C); s = ddadd2_d2_d2_d(s, ql * -PI_C); s = ddadd_d2_d2_d (s, (dqh + ql) * -PI_D); } else { ddi_t ddi = rempi(d); ql = ((ddi.i & 3) * 2 + (ddi.dd.x > 0) + 1) >> 2; if ((ddi.i & 1) != 0) { ddi.dd = ddadd2_d2_d2_d2(ddi.dd, 
dd(mulsign(3.141592653589793116*-0.5, ddi.dd.x), mulsign(1.2246467991473532072e-16*-0.5, ddi.dd.x))); } s = ddnormalize_d2_d2(ddi.dd); if (xisinf(d) || xisnan(d)) s.x = SLEEF_NAN; } t = s; s = ddsqu_d2_d2(s); double s2 = s.x * s.x, s4 = s2 * s2; u = POLY6(s.x, s2, s4, 2.72052416138529567917983e-15, -7.6429259411395447190023e-13, 1.60589370117277896211623e-10, -2.5052106814843123359368e-08, 2.75573192104428224777379e-06, -0.000198412698412046454654947); u = mla(u, s.x, 0.00833333333333318056201922); x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(-0.166666666666666657414808, u * s.x), s)); u = ddmul_d_d2_d2(t, x); if ((ql & 1) != 0) u = -u; if (xisnegzero(d)) u = d; return u; } EXPORT CONST double xcos(double d) { double u, s, t = d; int ql; if (fabsk(d) < TRIGRANGEMAX2) { ql = mla(2, rintk(d * M_1_PI - 0.5), 1); d = mla(ql, -PI_A2*0.5, d); d = mla(ql, -PI_B2*0.5, d); } else if (fabsk(d) < TRIGRANGEMAX) { double dqh = trunck(d * (M_1_PI / (INT64_C(1) << 23)) - 0.5 * (M_1_PI / (INT64_C(1) << 23))); ql = 2*rintk(d * M_1_PI - 0.5 - dqh * (double)(INT64_C(1) << 23))+1; dqh *= 1 << 24; d = mla(dqh, -PI_A*0.5, d); d = mla( ql, -PI_A*0.5, d); d = mla(dqh, -PI_B*0.5, d); d = mla( ql, -PI_B*0.5, d); d = mla(dqh, -PI_C*0.5, d); d = mla( ql, -PI_C*0.5, d); d = mla(dqh + ql , -PI_D*0.5, d); } else { ddi_t ddi = rempi(t); ql = ((ddi.i & 3) * 2 + (ddi.dd.x > 0) + 7) >> 1; if ((ddi.i & 1) == 0) { ddi.dd = ddadd2_d2_d2_d2(ddi.dd, dd(mulsign(3.141592653589793116*-0.5, ddi.dd.x > 0 ? 1 : -1), mulsign(1.2246467991473532072e-16*-0.5, ddi.dd.x > 0 ? 
1 : -1))); } d = ddi.dd.x + ddi.dd.y; if (xisinf(t) || xisnan(t)) d = SLEEF_NAN; } s = d * d; if ((ql & 2) == 0) d = -d; double s2 = s * s, s4 = s2 * s2; u = POLY8(s, s2, s4, -7.97255955009037868891952e-18, 2.81009972710863200091251e-15, -7.64712219118158833288484e-13, 1.60590430605664501629054e-10, -2.50521083763502045810755e-08, 2.75573192239198747630416e-06, -0.000198412698412696162806809, 0.00833333333333332974823815); u = mla(u, s, -0.166666666666666657414808); u = mla(s, u * d, d); return u; } EXPORT CONST double xcos_u1(double d) { double u; Sleef_double2 s, t, x; int ql; d = fabsk(d); if (d < TRIGRANGEMAX2) { ql = mla(2, rintk(d * M_1_PI - 0.5), 1); s = ddadd2_d2_d_d(d, ql * (-PI_A2*0.5)); s = ddadd_d2_d2_d(s, ql * (-PI_B2*0.5)); } else if (d < TRIGRANGEMAX) { double dqh = trunck(d * (M_1_PI / (INT64_C(1) << 23)) - 0.5 * (M_1_PI / (INT64_C(1) << 23))); ql = 2*rintk(d * M_1_PI - 0.5 - dqh * (double)(INT64_C(1) << 23))+1; dqh *= 1 << 24; u = mla(dqh, -PI_A*0.5, d); s = ddadd2_d2_d_d (u, ql * (-PI_A*0.5)); s = ddadd2_d2_d2_d(s, dqh * (-PI_B*0.5)); s = ddadd2_d2_d2_d(s, ql * (-PI_B*0.5)); s = ddadd2_d2_d2_d(s, dqh * (-PI_C*0.5)); s = ddadd2_d2_d2_d(s, ql * (-PI_C*0.5)); s = ddadd_d2_d2_d(s, (dqh + ql) * (-PI_D*0.5)); } else { ddi_t ddi = rempi(d); ql = ((ddi.i & 3) * 2 + (ddi.dd.x > 0) + 7) >> 1; if ((ddi.i & 1) == 0) { ddi.dd = ddadd2_d2_d2_d2(ddi.dd, dd(mulsign(3.141592653589793116*-0.5, ddi.dd.x > 0 ? 1 : -1), mulsign(1.2246467991473532072e-16*-0.5, ddi.dd.x > 0 ? 
1 : -1))); } s = ddnormalize_d2_d2(ddi.dd); if (xisinf(d) || xisnan(d)) s.x = SLEEF_NAN; } t = s; s = ddsqu_d2_d2(s); double s2 = s.x * s.x, s4 = s2 * s2; u = POLY6(s.x, s2, s4, 2.72052416138529567917983e-15, -7.6429259411395447190023e-13, 1.60589370117277896211623e-10, -2.5052106814843123359368e-08, 2.75573192104428224777379e-06, -0.000198412698412046454654947); u = mla(u, s.x, 0.00833333333333318056201922); x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(-0.166666666666666657414808, u * s.x), s)); u = ddmul_d_d2_d2(t, x); if ((((int)ql) & 2) == 0) u = -u; return u; } EXPORT CONST Sleef_double2 xsincos(double d) { double u, s, t; Sleef_double2 r; int ql; s = d; if (fabsk(d) < TRIGRANGEMAX2) { ql = rintk(s * (2 * M_1_PI)); s = mla(ql, -PI_A2*0.5, s); s = mla(ql, -PI_B2*0.5, s); } else if (fabsk(d) < TRIGRANGEMAX) { double dqh = trunck(d * ((2 * M_1_PI) / (1 << 24))) * (double)(1 << 24); ql = rintk(d * (2 * M_1_PI) - dqh); s = mla(dqh, -PI_A * 0.5, s); s = mla( ql, -PI_A * 0.5, s); s = mla(dqh, -PI_B * 0.5, s); s = mla( ql, -PI_B * 0.5, s); s = mla(dqh, -PI_C * 0.5, s); s = mla( ql, -PI_C * 0.5, s); s = mla(dqh + ql, -PI_D * 0.5, s); } else { ddi_t ddi = rempi(d); ql = ddi.i; s = ddi.dd.x + ddi.dd.y; if (xisinf(d) || xisnan(d)) s = SLEEF_NAN; } t = s; s = s * s; u = 1.58938307283228937328511e-10; u = mla(u, s, -2.50506943502539773349318e-08); u = mla(u, s, 2.75573131776846360512547e-06); u = mla(u, s, -0.000198412698278911770864914); u = mla(u, s, 0.0083333333333191845961746); u = mla(u, s, -0.166666666666666130709393); u = u * s * t; r.x = t + u; if (xisnegzero(d)) r.x = -0.0; u = -1.13615350239097429531523e-11; u = mla(u, s, 2.08757471207040055479366e-09); u = mla(u, s, -2.75573144028847567498567e-07); u = mla(u, s, 2.48015872890001867311915e-05); u = mla(u, s, -0.00138888888888714019282329); u = mla(u, s, 0.0416666666666665519592062); u = mla(u, s, -0.5); r.y = u * s + 1; if ((ql & 1) != 0) { s = r.y; r.y = r.x; r.x = s; } if ((ql & 2) != 0) { r.x = -r.x; } if 
(((ql+1) & 2) != 0) { r.y = -r.y; } return r; } EXPORT CONST Sleef_double2 xsincos_u1(double d) { double u; Sleef_double2 r, s, t, x; int ql; if (fabsk(d) < TRIGRANGEMAX2) { ql = rintk(d * (2 * M_1_PI)); u = mla(ql, -PI_A2*0.5, d); s = ddadd_d2_d_d (u, ql * (-PI_B2*0.5)); } else if (fabsk(d) < TRIGRANGEMAX) { const double dqh = trunck(d * ((2 * M_1_PI) / (1 << 24))) * (double)(1 << 24); ql = rintk(d * (2 * M_1_PI) - dqh); u = mla(dqh, -PI_A*0.5, d); s = ddadd_d2_d_d(u, ql * (-PI_A*0.5)); s = ddadd2_d2_d2_d(s, dqh * (-PI_B*0.5)); s = ddadd2_d2_d2_d(s, ql * (-PI_B*0.5)); s = ddadd2_d2_d2_d(s, dqh * (-PI_C*0.5)); s = ddadd2_d2_d2_d(s, ql * (-PI_C*0.5)); s = ddadd_d2_d2_d(s, (dqh + ql) * (-PI_D*0.5)); } else { ddi_t ddi = rempi(d); ql = ddi.i; s = ddi.dd; if (xisinf(d) || xisnan(d)) s = dd(SLEEF_NAN, SLEEF_NAN); } t = s; s.x = ddsqu_d_d2(s); u = 1.58938307283228937328511e-10; u = mla(u, s.x, -2.50506943502539773349318e-08); u = mla(u, s.x, 2.75573131776846360512547e-06); u = mla(u, s.x, -0.000198412698278911770864914); u = mla(u, s.x, 0.0083333333333191845961746); u = mla(u, s.x, -0.166666666666666130709393); u *= s.x * t.x; x = ddadd_d2_d2_d(t, u); r.x = x.x + x.y; if (xisnegzero(d)) r.x = -0.0; u = -1.13615350239097429531523e-11; u = mla(u, s.x, 2.08757471207040055479366e-09); u = mla(u, s.x, -2.75573144028847567498567e-07); u = mla(u, s.x, 2.48015872890001867311915e-05); u = mla(u, s.x, -0.00138888888888714019282329); u = mla(u, s.x, 0.0416666666666665519592062); u = mla(u, s.x, -0.5); x = ddadd_d2_d_d2(1, ddmul_d2_d_d(s.x, u)); r.y = x.x + x.y; if ((ql & 1) != 0) { u = r.y; r.y = r.x; r.x = u; } if ((ql & 2) != 0) { r.x = -r.x; } if (((ql+1) & 2) != 0) { r.y = -r.y; } return r; } EXPORT CONST Sleef_double2 xsincospi_u05(double d) { double u, s, t; Sleef_double2 r, x, s2; u = d * 4; int q = ceilk(u) & ~(int)1; s = u - (double)q; t = s; s = s * s; s2 = ddmul_d2_d_d(t, t); // u = -2.02461120785182399295868e-14; u = mla(u, s, 6.94821830580179461327784e-12); u = mla(u, 
s, -1.75724749952853179952664e-09); u = mla(u, s, 3.13361688966868392878422e-07); u = mla(u, s, -3.6576204182161551920361e-05); u = mla(u, s, 0.00249039457019271850274356); x = ddadd2_d2_d_d2(u * s, dd(-0.0807455121882807852484731, 3.61852475067037104849987e-18)); x = ddadd2_d2_d2_d2(ddmul_d2_d2_d2(s2, x), dd(0.785398163397448278999491, 3.06287113727155002607105e-17)); x = ddmul_d2_d2_d(x, t); r.x = x.x + x.y; if (xisnegzero(d)) r.x = -0.0; // u = 9.94480387626843774090208e-16; u = mla(u, s, -3.89796226062932799164047e-13); u = mla(u, s, 1.15011582539996035266901e-10); u = mla(u, s, -2.4611369501044697495359e-08); u = mla(u, s, 3.59086044859052754005062e-06); u = mla(u, s, -0.000325991886927389905997954); x = ddadd2_d2_d_d2(u * s, dd(0.0158543442438155018914259, -1.04693272280631521908845e-18)); x = ddadd2_d2_d2_d2(ddmul_d2_d2_d2(s2, x), dd(-0.308425137534042437259529, -1.95698492133633550338345e-17)); x = ddadd2_d2_d2_d(ddmul_d2_d2_d2(x, s2), 1); r.y = x.x + x.y; // if ((q & 2) != 0) { s = r.y; r.y = r.x; r.x = s; } if ((q & 4) != 0) { r.x = -r.x; } if (((q+2) & 4) != 0) { r.y = -r.y; } if (fabsk(d) > TRIGRANGEMAX3/4) { r.x = 0; r.y = 1; } if (xisinf(d)) { r.x = r.y = SLEEF_NAN; } return r; } EXPORT CONST Sleef_double2 xsincospi_u35(double d) { double u, s, t; Sleef_double2 r; u = d * 4; int q = ceilk(u) & ~(int)1; s = u - (double)q; t = s; s = s * s; // u = +0.6880638894766060136e-11; u = mla(u, s, -0.1757159564542310199e-8); u = mla(u, s, +0.3133616327257867311e-6); u = mla(u, s, -0.3657620416388486452e-4); u = mla(u, s, +0.2490394570189932103e-2); u = mla(u, s, -0.8074551218828056320e-1); u = mla(u, s, +0.7853981633974482790e+0); r.x = u * t; // u = -0.3860141213683794352e-12; u = mla(u, s, +0.1150057888029681415e-9); u = mla(u, s, -0.2461136493006663553e-7); u = mla(u, s, +0.3590860446623516713e-5); u = mla(u, s, -0.3259918869269435942e-3); u = mla(u, s, +0.1585434424381541169e-1); u = mla(u, s, -0.3084251375340424373e+0); u = mla(u, s, 1); r.y = u; // if ((q 
& 2) != 0) { s = r.y; r.y = r.x; r.x = s; } if ((q & 4) != 0) { r.x = -r.x; } if (((q+2) & 4) != 0) { r.y = -r.y; } if (fabsk(d) > TRIGRANGEMAX3/4) { r.x = 0; r.y = 1; } if (xisinf(d)) { r.x = r.y = SLEEF_NAN; } return r; } static INLINE CONST Sleef_double2 sinpik(double d) { double u, s, t; Sleef_double2 x, s2; u = d * 4; int q = ceilk(u) & ~1; int o = (q & 2) != 0; s = u - (double)q; t = s; s = s * s; s2 = ddmul_d2_d_d(t, t); // u = o ? 9.94480387626843774090208e-16 : -2.02461120785182399295868e-14; u = mla(u, s, o ? -3.89796226062932799164047e-13 : 6.94821830580179461327784e-12); u = mla(u, s, o ? 1.15011582539996035266901e-10 : -1.75724749952853179952664e-09); u = mla(u, s, o ? -2.4611369501044697495359e-08 : 3.13361688966868392878422e-07); u = mla(u, s, o ? 3.59086044859052754005062e-06 : -3.6576204182161551920361e-05); u = mla(u, s, o ? -0.000325991886927389905997954 : 0.00249039457019271850274356); x = ddadd2_d2_d_d2(u * s, o ? dd(0.0158543442438155018914259, -1.04693272280631521908845e-18) : dd(-0.0807455121882807852484731, 3.61852475067037104849987e-18)); x = ddadd2_d2_d2_d2(ddmul_d2_d2_d2(s2, x), o ? dd(-0.308425137534042437259529, -1.95698492133633550338345e-17) : dd(0.785398163397448278999491, 3.06287113727155002607105e-17)); x = ddmul_d2_d2_d2(x, o ? s2 : dd(t, 0)); x = o ? ddadd2_d2_d2_d(x, 1) : x; // if ((q & 4) != 0) { x.x = -x.x; x.y = -x.y; } return x; } EXPORT CONST double xsinpi_u05(double d) { Sleef_double2 x = sinpik(d); double r = x.x + x.y; if (xisnegzero(d)) r = -0.0; if (fabsk(d) > TRIGRANGEMAX3/4) r = 0; if (xisinf(d)) r = SLEEF_NAN; return r; } static INLINE CONST Sleef_double2 cospik(double d) { double u, s, t; Sleef_double2 x, s2; u = d * 4; int q = ceilk(u) & ~1; int o = (q & 2) == 0; s = u - (double)q; t = s; s = s * s; s2 = ddmul_d2_d_d(t, t); // u = o ? 9.94480387626843774090208e-16 : -2.02461120785182399295868e-14; u = mla(u, s, o ? -3.89796226062932799164047e-13 : 6.94821830580179461327784e-12); u = mla(u, s, o ? 
1.15011582539996035266901e-10 : -1.75724749952853179952664e-09);
  u = mla(u, s, o ? -2.4611369501044697495359e-08 : 3.13361688966868392878422e-07);
  u = mla(u, s, o ? 3.59086044859052754005062e-06 : -3.6576204182161551920361e-05);
  u = mla(u, s, o ? -0.000325991886927389905997954 : 0.00249039457019271850274356);
  x = ddadd2_d2_d_d2(u * s, o ? dd(0.0158543442438155018914259, -1.04693272280631521908845e-18) : dd(-0.0807455121882807852484731, 3.61852475067037104849987e-18));
  x = ddadd2_d2_d2_d2(ddmul_d2_d2_d2(s2, x), o ? dd(-0.308425137534042437259529, -1.95698492133633550338345e-17) : dd(0.785398163397448278999491, 3.06287113727155002607105e-17));
  x = ddmul_d2_d2_d2(x, o ? s2 : dd(t, 0));
  x = o ? ddadd2_d2_d2_d(x, 1) : x;

  // Sign flip for the second half of the period (phase-shifted vs sinpik).
  if (((q+2) & 4) != 0) { x.x = -x.x; x.y = -x.y; }
  return x;
}

/* xcospi_u05: cos(pi*d) via the double-double kernel above. */
EXPORT CONST double xcospi_u05(double d) {
  Sleef_double2 x = cospik(d);
  double r = x.x + x.y;
  if (fabsk(d) > TRIGRANGEMAX3/4) r = 1;
  if (xisinf(d)) r = SLEEF_NAN;
  return r;
}

/* xtan: tangent via the half-angle identity tan(d) = 2u/(1-u^2), u = tan(d/2).
   Three-tier argument reduction: Cody–Waite with 2 constants for small args,
   split-high/low Cody–Waite up to 1e6, and full rempi() reduction beyond. */
EXPORT CONST double xtan(double d) {
  double u, s, x, y;
  int ql;

  if (fabsk(d) < TRIGRANGEMAX2) {
    ql = rintk(d * (2 * M_1_PI));
    x = mla(ql, -PI_A2*0.5, d);
    x = mla(ql, -PI_B2*0.5, x);
  } else if (fabsk(d) < 1e+6) {
    // Split the quotient into a high part (multiple of 2^24) and a low part so
    // each mla against the PI_A..PI_D pieces stays exact.
    double dqh = trunck(d * ((2 * M_1_PI) / (1 << 24))) * (double)(1 << 24);
    ql = rintk(d * (2 * M_1_PI) - dqh);
    x = mla(dqh, -PI_A * 0.5, d);
    x = mla( ql, -PI_A * 0.5, x);
    x = mla(dqh, -PI_B * 0.5, x);
    x = mla( ql, -PI_B * 0.5, x);
    x = mla(dqh, -PI_C * 0.5, x);
    x = mla( ql, -PI_C * 0.5, x);
    x = mla(dqh + ql, -PI_D * 0.5, x);
  } else {
    // Huge arguments: Payne–Hanek style reduction.
    ddi_t ddi = rempi(d);
    ql = ddi.i;
    x = ddi.dd.x + ddi.dd.y;
    if (xisinf(d) || xisnan(d)) x = SLEEF_NAN;
  }

  x *= 0.5;                    // half angle
  s = x * x;
  double s2 = s * s, s4 = s2 * s2;
  u = POLY8(s, s2, s4,
            +0.3245098826639276316e-3, +0.5619219738114323735e-3,
            +0.1460781502402784494e-2, +0.3591611540792499519e-2,
            +0.8863268409563113126e-2, +0.2186948728185535498e-1,
            +0.5396825399517272970e-1, +0.1333333333330500581e+0);
  u = mla(u, s, +0.3333333333333343695e+0);
  u = mla(s, u * x, x);        // u ~= tan(x)
  y = mla(u, u, -1);           // y = u^2 - 1
  x = -2 * u;
  if ((ql & 1) != 0) { double t = x; x = y; y = -t; }  // odd quadrant: cotangent form
  u = x / y;
  return u;
}

/* xtan_u1: tangent, double-double variant ("_u1" is SLEEF's ~1-ULP tier naming
   — TODO confirm). Same half-angle scheme as xtan with dd arithmetic. */
EXPORT CONST double xtan_u1(double d) {
  double u;
  Sleef_double2 s, t, x, y;
  int ql;

  if (fabsk(d) < TRIGRANGEMAX2) {
    ql = rintk(d * (2 * M_1_PI));
    u = mla(ql, -PI_A2*0.5, d);
    s = ddadd_d2_d_d(u, ql * (-PI_B2*0.5));
  } else if (fabsk(d) < TRIGRANGEMAX) {
    // High/low split reduction carried in double-double.
    const double dqh = trunck(d * (M_2_PI / (1 << 24))) * (double)(1 << 24);
    s = ddadd2_d2_d2_d(ddmul_d2_d2_d(dd(M_2_PI_H, M_2_PI_L), d), (d < 0 ? -0.5 : 0.5) - dqh);
    ql = s.x + s.y;
    u = mla(dqh, -PI_A*0.5, d);
    s = ddadd_d2_d_d (u, ql * (-PI_A*0.5));
    s = ddadd2_d2_d2_d(s, dqh * (-PI_B*0.5));
    s = ddadd2_d2_d2_d(s, ql * (-PI_B*0.5));
    s = ddadd2_d2_d2_d(s, dqh * (-PI_C*0.5));
    s = ddadd2_d2_d2_d(s, ql * (-PI_C*0.5));
    s = ddadd_d2_d2_d(s, (dqh + ql) * (-PI_D*0.5));
  } else {
    ddi_t ddi = rempi(d);
    ql = ddi.i;
    s = ddi.dd;
    if (xisinf(d) || xisnan(d)) s.x = SLEEF_NAN;
  }

  t = ddscale_d2_d2_d(s, 0.5);
  s = ddsqu_d2_d2(t);
  double s2 = s.x * s.x, s4 = s2 * s2;
  u = POLY8(s.x, s2, s4,
            +0.3245098826639276316e-3, +0.5619219738114323735e-3,
            +0.1460781502402784494e-2, +0.3591611540792499519e-2,
            +0.8863268409563113126e-2, +0.2186948728185535498e-1,
            +0.5396825399517272970e-1, +0.1333333333330500581e+0);
  u = mla(u, s.x, +0.3333333333333343695e+0);
  x = ddadd_d2_d2_d2(t, ddmul_d2_d2_d(ddmul_d2_d2_d2(s, t), u));  // x ~= tan(t) in dd
  y = ddadd_d2_d_d2(-1, ddsqu_d2_d2(x));
  x = ddscale_d2_d2_d(x, -2);
  if ((ql & 1) != 0) { t = x; x = y; y = ddneg_d2_d2(t); }
  x = dddiv_d2_d2_d2(x, y);
  u = x.x + x.y;
  if (xisnegzero(d)) u = d;    // preserve -0.0
  return u;
}

/* xlog: natural logarithm. Denormals are prescaled by 2^64 (o), the argument
   is split as d = m * 2^e with m near 1, and log(m) is evaluated through the
   atanh-style series in x = (m-1)/(m+1). */
EXPORT CONST double xlog(double d) {
  double x, x2, t, m;
  int e;

  int o = d < DBL_MIN;
  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);

  e = ilogb2k(d * (1.0/0.75));
  m = ldexp3k(d, -e);
  if (o) e -= 64;              // undo the denormal prescale

  x = (m-1) / (m+1);
  x2 = x * x;
  double x4 = x2 * x2, x8 = x4 * x4;
  t = POLY7(x2, x4, x8,
            0.153487338491425068243146, 0.152519917006351951593857,
            0.181863266251982985677316, 0.222221366518767365905163,
            0.285714294746548025383248, 0.399999999950799600689777,
            0.6666666666667778740063);
  x = x * 2 + 0.693147180559945286226764 * e + x * x2 * t;  // 2*atanh + e*ln2

  if (xisinf(d)) x = SLEEF_INFINITY;
  if (d < 0 || xisnan(d)) x = SLEEF_NAN;
  if (d == 0) x = -SLEEF_INFINITY;
  return x;
}

/* xexp: exponential. Cody–Waite reduction by ln2 (q = round(d/ln2)), degree-12
   polynomial on the remainder, then scale by 2^q. */
EXPORT CONST double xexp(double d) {
  int q = (int)rintk(d * R_LN2);
  double s, u;

  s = mla(q, -L2U, d);
  s = mla(q, -L2L, s);

  double s2 = s * s, s4 = s2 * s2, s8 = s4 * s4;
  u = POLY10(s, s2, s4, s8,
             2.08860621107283687536341e-09, 2.51112930892876518610661e-08,
             2.75573911234900471893338e-07, 2.75572362911928827629423e-06,
             2.4801587159235472998791e-05, 0.000198412698960509205564975,
             0.00138888888889774492207962, 0.00833333333331652721664984,
             0.0416666666666665047591422, 0.166666666666666851703837);
  u = mla(u, s, +0.5);
  u = s * s * u + s + 1;
  u = ldexp2k(u, q);

  if (d > 709.78271114955742909217217426) u = SLEEF_INFINITY;  // overflow threshold ~ log(DBL_MAX)
  if (d < -1000) u = 0;
  return u;
}

/* expm1k: exp(d) - 1 kernel; shares xexp's reduction/polynomial but keeps the
   "- 1" exact when q != 0 by scaling u + 1 and subtracting 1 afterwards. */
static INLINE CONST double expm1k(double d) {
  int q = (int)rintk(d * R_LN2);
  double s, u;

  s = mla(q, -L2U, d);
  s = mla(q, -L2L, s);

  double s2 = s * s, s4 = s2 * s2, s8 = s4 * s4;
  u = POLY10(s, s2, s4, s8,
             2.08860621107283687536341e-09, 2.51112930892876518610661e-08,
             2.75573911234900471893338e-07, 2.75572362911928827629423e-06,
             2.4801587159235472998791e-05, 0.000198412698960509205564975,
             0.00138888888889774492207962, 0.00833333333331652721664984,
             0.0416666666666665047591422, 0.166666666666666851703837);
  u = mla(s2, 0.5, s2 * s * u) + s;

  if (q != 0) u = ldexp2k(u + 1, q) - 1;
  return u;
}

/* logk: log(d) in full double-double precision; used by xpow and the inverse
   hyperbolics. Same m * 2^e split as xlog, dd series in (m-1)/(m+1).
   Body continues on the next chunk line. */
static INLINE CONST Sleef_double2 logk(double d) {
  Sleef_double2 x, x2, s;
  double m, t;
  int e;

  int o = d < DBL_MIN;
  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);

  e = ilogb2k(d * (1.0/0.75));
  m = ldexp3k(d, -e);
  if (o) e -= 64;

  x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m));
  x2 = ddsqu_d2_d2(x);

  double x4 = x2.x * x2.x, x8 = x4 * x4, x16 = x8 * x8;
  t = POLY9(x2.x, x4, x8, x16,
            0.116255524079935043668677, 0.103239680901072952701192,
            0.117754809412463995466069,
0.13332981086846273921509, 0.153846227114512262845736,
            0.181818180850050775676507, 0.222222222230083560345903,
            0.285714285714249172087875, 0.400000000000000077715612);
  // Low-order terms carried in double-double: s = e*ln2 + 2x + x^3*c + x^5*t.
  Sleef_double2 c = dd(0.666666666666666629659233, 3.80554962542412056336616e-17);
  s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e);
  s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2));
  x = ddmul_d2_d2_d2(x2, x);
  s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d2(x, c));
  x = ddmul_d2_d2_d2(x2, x);
  s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(x, t));
  return s;
}

/* xlog_u1: natural logarithm, double-double tier. High-order series terms in
   plain doubles, low-order terms accumulated in double-double. */
EXPORT CONST double xlog_u1(double d) {
  Sleef_double2 x, s;
  double m, t, x2;
  int e;

  int o = d < DBL_MIN;
  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);

  e = ilogb2k(d * (1.0/0.75));
  m = ldexp3k(d, -e);
  if (o) e -= 64;

  x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m));
  x2 = x.x * x.x;
  double x4 = x2 * x2, x8 = x4 * x4;
  t = POLY7(x2, x4, x8,
            0.1532076988502701353e+0, 0.1525629051003428716e+0,
            0.1818605932937785996e+0, 0.2222214519839380009e+0,
            0.2857142932794299317e+0, 0.3999999999635251990e+0,
            0.6666666666667333541e+0);
  s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), (double)e);
  s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2));
  s = ddadd_d2_d2_d(s, x2 * x.x * t);

  double r = s.x + s.y;
  if (xisinf(d)) r = SLEEF_INFINITY;
  if (d < 0 || xisnan(d)) r = SLEEF_NAN;
  if (d == 0) r = -SLEEF_INFINITY;
  return r;
}

/* expk: exp of a double-double argument, returning a plain double; counterpart
   of logk, used by xpow. */
static INLINE CONST double expk(Sleef_double2 d) {
  int q = (int)rintk((d.x + d.y) * R_LN2);
  Sleef_double2 s, t;
  double u;

  s = ddadd2_d2_d2_d(d, q * -L2U);
  s = ddadd2_d2_d2_d(s, q * -L2L);
  s = ddnormalize_d2_d2(s);

  double s2 = s.x * s.x, s4 = s2 * s2, s8 = s4 * s4;
  u = POLY10(s.x, s2, s4, s8,
             2.51069683420950419527139e-08, 2.76286166770270649116855e-07,
             2.75572496725023574143864e-06, 2.48014973989819794114153e-05,
             0.000198412698809069797676111, 0.0013888888939977128960529,
             0.00833333333332371417601081, 0.0416666666665409524128449,
             0.166666666666666740681535, 0.500000000000000999200722);
  t = ddadd_d2_d_d2(1, s);
  t = ddadd_d2_d2_d2(t, ddmul_d2_d2_d(ddsqu_d2_d2(s), u));
  u = ldexpk(t.x + t.y, q);

  if (d.x < -1000) u = 0;
  return u;
}

/* xpow: x^y computed as expk(y * logk(|x|)) with explicit IEEE-754 special
   cases: integer/odd-y sign rules, infinities, zeros, NaNs, and y==0 / x==1. */
EXPORT CONST double xpow(double x, double y) {
  int yisint = xisint(y);
  int yisodd = yisint && xisodd(y);

  Sleef_double2 d = ddmul_d2_d2_d(logk(fabsk(x)), y);
  double result = expk(d);
  if (d.x > 709.78271114955742909217217426) result = SLEEF_INFINITY;

  result = xisnan(result) ? SLEEF_INFINITY : result;
  // Sign: negative base is only valid for integer exponents (odd -> negative).
  result *= (x > 0 ? 1 : (!yisint ? SLEEF_NAN : (yisodd ? -1 : 1)));

  double efx = mulsign(fabsk(x) - 1, y);
  if (xisinf(y)) result = efx < 0 ? 0.0 : (efx == 0 ? 1.0 : SLEEF_INFINITY);
  if (xisinf(x) || x == 0) result = (yisodd ? sign(x) : 1) * ((x == 0 ? -y : y) < 0 ? 0 : SLEEF_INFINITY);
  if (xisnan(x) || xisnan(y)) result = SLEEF_NAN;
  if (y == 0 || x == 1) result = 1;

  return result;
}

/* expk2: exp of a double-double argument returned as a double-double; used by
   the hyperbolic functions and expm1/tgamma. */
static INLINE CONST Sleef_double2 expk2(Sleef_double2 d) {
  int q = (int)rintk((d.x + d.y) * R_LN2);
  Sleef_double2 s, t;
  double u;

  s = ddadd2_d2_d2_d(d, q * -L2U);
  s = ddadd2_d2_d2_d(s, q * -L2L);

  u = +0.1602472219709932072e-9;
  u = mla(u, s.x, +0.2092255183563157007e-8);
  u = mla(u, s.x, +0.2505230023782644465e-7);
  u = mla(u, s.x, +0.2755724800902135303e-6);
  u = mla(u, s.x, +0.2755731892386044373e-5);
  u = mla(u, s.x, +0.2480158735605815065e-4);
  u = mla(u, s.x, +0.1984126984148071858e-3);
  u = mla(u, s.x, +0.1388888888886763255e-2);
  u = mla(u, s.x, +0.8333333333333347095e-2);
  u = mla(u, s.x, +0.4166666666666669905e-1);

  t = ddadd2_d2_d2_d(ddmul_d2_d2_d(s, u), +0.1666666666666666574e+0);
  t = ddadd2_d2_d2_d(ddmul_d2_d2_d2(s, t), 0.5);
  t = ddadd2_d2_d2_d2(s, ddmul_d2_d2_d2(ddsqu_d2_d2(s), t));
  t = ddadd2_d2_d_d2(1, t);

  t.x = ldexp2k(t.x, q);
  t.y = ldexp2k(t.y, q);

  return d.x < -1000 ? dd(0, 0) : t;
}

/* xsinh: sinh(x) = (e^|x| - e^-|x|)/2 in double-double, sign restored at end. */
EXPORT CONST double xsinh(double x) {
  double y = fabsk(x);
  Sleef_double2 d = expk2(dd(y, 0));
  d = ddsub_d2_d2_d2(d, ddrec_d2_d2(d));
  y = (d.x + d.y) * 0.5;

  y = fabsk(x) > 710 ? SLEEF_INFINITY : y;   // e^710 overflows double
  y = xisnan(y) ? SLEEF_INFINITY : y;
  y = mulsign(y, x);
  y = xisnan(x) ? SLEEF_NAN : y;

  return y;
}

/* xcosh: cosh(x) = (e^|x| + e^-|x|)/2 in double-double. */
EXPORT CONST double xcosh(double x) {
  double y = fabsk(x);
  Sleef_double2 d = expk2(dd(y, 0));
  d = ddadd_d2_d2_d2(d, ddrec_d2_d2(d));
  y = (d.x + d.y) * 0.5;

  y = fabsk(x) > 710 ? SLEEF_INFINITY : y;
  y = xisnan(y) ? SLEEF_INFINITY : y;
  y = xisnan(x) ? SLEEF_NAN : y;

  return y;
}

/* xtanh: tanh(x) = (e^x - e^-x)/(e^x + e^-x); saturates to +-1 past ~18.715. */
EXPORT CONST double xtanh(double x) {
  double y = fabsk(x);
  Sleef_double2 d = expk2(dd(y, 0));
  Sleef_double2 e = ddrec_d2_d2(d);
  d = dddiv_d2_d2_d2(ddsub_d2_d2_d2(d, e), ddadd_d2_d2_d2(d, e));
  y = d.x + d.y;

  y = fabsk(x) > 18.714973875 ? 1.0 : y;
  y = xisnan(y) ? 1.0 : y;
  y = mulsign(y, x);
  y = xisnan(x) ? SLEEF_NAN : y;

  return y;
}

/* xsinh_u35: faster sinh via expm1k: sinh = (e+2)/(e+1) * e/2, e = e^|x| - 1. */
EXPORT CONST double xsinh_u35(double x) {
  double e = expm1k(fabsk(x));
  double y = (e + 2) / (e + 1) * (0.5 * e);

  y = fabsk(x) > 709 ? SLEEF_INFINITY : y;
  y = xisnan(y) ? SLEEF_INFINITY : y;
  y = mulsign(y, x);
  y = xisnan(x) ? SLEEF_NAN : y;

  return y;
}

/* xcosh_u35: faster cosh directly from xexp: (1/e + e)/2. */
EXPORT CONST double xcosh_u35(double x) {
  double e = xexp(fabsk(x));
  double y = 0.5 / e + 0.5 * e;

  y = fabsk(x) > 709 ? SLEEF_INFINITY : y;
  y = xisnan(y) ? SLEEF_INFINITY : y;
  y = xisnan(x) ? SLEEF_NAN : y;

  return y;
}

/* xtanh_u35: faster tanh via expm1k: d/(d+2) with d = e^(2|x|) - 1. */
EXPORT CONST double xtanh_u35(double x) {
  double y = fabsk(x);
  double d = expm1k(2*y);
  y = d / (d + 2);

  y = fabsk(x) > 18.714973875 ? 1.0 : y;
  y = xisnan(y) ? 1.0 : y;
  y = mulsign(y, x);
  y = xisnan(x) ?
    SLEEF_NAN : y;

  return y;
}

/* logk2: log of a double-double argument, double-double result; used by the
   inverse hyperbolics and lgamma. Same (m-1)/(m+1) series as logk. */
static INLINE CONST Sleef_double2 logk2(Sleef_double2 d) {
  Sleef_double2 x, x2, m, s;
  double t;
  int e;

  e = ilogbk(d.x * (1.0/0.75));

  m.x = ldexp2k(d.x, -e);
  m.y = ldexp2k(d.y, -e);

  x = dddiv_d2_d2_d2(ddadd2_d2_d2_d(m, -1), ddadd2_d2_d2_d(m, 1));
  x2 = ddsqu_d2_d2(x);

  double x4 = x2.x * x2.x, x8 = x4 * x4;
  t = POLY7(x2.x, x4, x8,
            0.13860436390467167910856, 0.131699838841615374240845,
            0.153914168346271945653214, 0.181816523941564611721589,
            0.22222224632662035403996, 0.285714285511134091777308,
            0.400000000000914013309483);
  t = mla(t, x2.x, 0.666666666666664853302393);

  s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e);
  s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2));
  s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddmul_d2_d2_d2(x2, x), t));

  return s;
}

/* xasinh: asinh(x) = log(x + sqrt(x^2+1)), with a 1/x reformulation for |x|>1
   to avoid cancellation, all in double-double. */
EXPORT CONST double xasinh(double x) {
  double y = fabsk(x);
  Sleef_double2 d;

  d = y > 1 ? ddrec_d2_d(x) : dd(y, 0);
  d = ddsqrt_d2_d2(ddadd2_d2_d2_d(ddsqu_d2_d2(d), 1));
  d = y > 1 ? ddmul_d2_d2_d(d, y) : d;

  d = logk2(ddnormalize_d2_d2(ddadd_d2_d2_d(d, x)));
  y = d.x + d.y;

  y = (fabsk(x) > SQRT_DBL_MAX || xisnan(y)) ? mulsign(SLEEF_INFINITY, x) : y;
  y = xisnan(x) ? SLEEF_NAN : y;
  y = xisnegzero(x) ? -0.0 : y;

  return y;
}

/* xacosh: acosh(x) = log(x + sqrt(x-1)*sqrt(x+1)); NaN for x < 1. */
EXPORT CONST double xacosh(double x) {
  Sleef_double2 d = logk2(ddadd2_d2_d2_d(ddmul_d2_d2_d2(ddsqrt_d2_d2(ddadd2_d2_d_d(x, 1)), ddsqrt_d2_d2(ddadd2_d2_d_d(x, -1))), x));
  double y = d.x + d.y;

  y = (x > SQRT_DBL_MAX || xisnan(y)) ? SLEEF_INFINITY : y;
  y = x == 1.0 ? 0.0 : y;
  y = x < 1.0 ? SLEEF_NAN : y;
  y = xisnan(x) ? SLEEF_NAN : y;

  return y;
}

/* xatanh: atanh(x) = log((1+x)/(1-x))/2 on |x|; sign restored afterwards.
   Tail of this function continues on the next chunk line. */
EXPORT CONST double xatanh(double x) {
  double y = fabsk(x);
  Sleef_double2 d = logk2(dddiv_d2_d2_d2(ddadd2_d2_d_d(1, y), ddadd2_d2_d_d(1, -y)));
  y = y > 1.0 ? SLEEF_NAN : (y == 1.0 ? SLEEF_INFINITY : (d.x + d.y) * 0.5);

  y = mulsign(y, x);
  y = (xisinf(x) || xisnan(y)) ?
SLEEF_NAN : y;
  return y;
}

/* xcbrt: cube root, max error 2 ulps (per the original comment). Splits the
   exponent into a multiple of 3 plus a remainder r, seeds with a degree-5
   polynomial approximation of d^(-2/3), then applies Newton steps. */
EXPORT CONST double xcbrt(double d) { // max error : 2 ulps
  double x, y, q = 1.0;
  int e, r;

  e = ilogbk(fabsk(d))+1;
  d = ldexp2k(d, -e);
  r = (e + 6144) % 3;          // +6144 keeps the operands of % non-negative
  q = (r == 1) ? 1.2599210498948731647672106 : q;   // 2^(1/3)
  q = (r == 2) ? 1.5874010519681994747517056 : q;   // 2^(2/3)
  q = ldexp2k(q, (e + 6144) / 3 - 2048);

  q = mulsign(q, d);
  d = fabsk(d);

  // Initial approximation polynomial.
  x = -0.640245898480692909870982;
  x = mla(x, d, 2.96155103020039511818595);
  x = mla(x, d, -5.73353060922947843636166);
  x = mla(x, d, 6.03990368989458747961407);
  x = mla(x, d, -3.85841935510444988821632);
  x = mla(x, d, 2.2307275302496609725722);

  // Newton refinement.
  y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0);
  y = d * x * x;
  y = (y - (2.0 / 3.0) * y * (y * x - 1)) * q;
  return y;
}

/* xcbrt_u1: cube root, double-double tier; same seed polynomial, final Newton
   step carried in double-double. */
EXPORT CONST double xcbrt_u1(double d) {
  double x, y, z;
  Sleef_double2 q2 = dd(1, 0), u, v;
  int e, r;

  e = ilogbk(fabsk(d))+1;
  d = ldexp2k(d, -e);
  r = (e + 6144) % 3;
  q2 = (r == 1) ? dd(1.2599210498948731907, -2.5899333753005069177e-17) : q2;
  q2 = (r == 2) ? dd(1.5874010519681995834, -1.0869008194197822986e-16) : q2;

  q2.x = mulsign(q2.x, d);
  q2.y = mulsign(q2.y, d);
  d = fabsk(d);

  x = -0.640245898480692909870982;
  x = mla(x, d, 2.96155103020039511818595);
  x = mla(x, d, -5.73353060922947843636166);
  x = mla(x, d, 6.03990368989458747961407);
  x = mla(x, d, -3.85841935510444988821632);
  x = mla(x, d, 2.2307275302496609725722);

  y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0);

  z = x;

  u = ddmul_d2_d_d(x, x);
  u = ddmul_d2_d2_d2(u, u);
  u = ddmul_d2_d2_d(u, d);
  u = ddadd2_d2_d2_d(u, -x);
  y = u.x + u.y;

  y = -2.0 / 3.0 * y * z;
  v = ddadd2_d2_d2_d(ddmul_d2_d_d(z, z), y);
  v = ddmul_d2_d2_d(v, d);
  v = ddmul_d2_d2_d2(v, q2);
  z = ldexp2k(v.x + v.y, (e + 6144) / 3 - 2048);

  if (xisinf(d)) { z = mulsign(SLEEF_INFINITY, q2.x); }
  if (d == 0) { z = mulsign(0, q2.x); }

  return z;
}

/* xexp2: 2^d. Integer/fraction split, polynomial in the fraction, dd-corrected
   final add of 1, scale by 2^q. */
EXPORT CONST double xexp2(double d) {
  int q = (int)rintk(d);
  double s, u;

  s = d - q;

  double s2 = s * s, s4 = s2 * s2, s8 = s4 * s4;
  u = POLY10(s, s2, s4, s8,
             +0.4434359082926529454e-9, +0.7073164598085707425e-8,
             +0.1017819260921760451e-6, +0.1321543872511327615e-5,
             +0.1525273353517584730e-4, +0.1540353045101147808e-3,
             +0.1333355814670499073e-2, +0.9618129107597600536e-2,
             +0.5550410866482046596e-1, +0.2402265069591012214e+0);
  u = mla(u, s, +0.6931471805599452862e+0);
  u = ddnormalize_d2_d2(ddadd_d2_d_d2(1, ddmul_d2_d_d(u, s))).x;
  u = ldexp2k(u, q);

  if (d >= 1024) u = SLEEF_INFINITY;
  if (d < -2000) u = 0;

  return u;
}

/* xexp2_u35: 2^d, faster tier — pure mla chain, no double-double correction. */
EXPORT CONST double xexp2_u35(double d) {
  int q = (int)rintk(d);
  double s, u;

  s = d - q;

  u = +0.4434359082926529454e-9;
  u = mla(u, s, +0.7073164598085707425e-8);
  u = mla(u, s, +0.1017819260921760451e-6);
  u = mla(u, s, +0.1321543872511327615e-5);
  u = mla(u, s, +0.1525273353517584730e-4);
  u = mla(u, s, +0.1540353045101147808e-3);
  u = mla(u, s, +0.1333355814670499073e-2);
  u = mla(u, s, +0.9618129107597600536e-2);
  u = mla(u, s, +0.5550410866482046596e-1);
  u = mla(u, s, +0.2402265069591012214e+0);
  u = mla(u, s, +0.6931471805599452862e+0);
  u = mla(u, s, +0.1000000000000000000e+1);
  u = ldexp2k(u, q);

  if (d >= 1024) u = SLEEF_INFINITY;
  if (d < -2000) u = 0;

  return u;
}

/* xexp10: 10^d. Cody–Waite reduction by log10(2) (L10U/L10L), polynomial in
   the remainder, dd-corrected final add of 1. */
EXPORT CONST double xexp10(double d) {
  int q = (int)rintk(d * LOG10_2);
  double s, u;

  s = mla(q, -L10U, d);
  s = mla(q, -L10L, s);

  u = +0.2411463498334267652e-3;
  u = mla(u, s, +0.1157488415217187375e-2);
  u = mla(u, s, +0.5013975546789733659e-2);
  u = mla(u, s, +0.1959762320720533080e-1);
  u = mla(u, s, +0.6808936399446784138e-1);
  u = mla(u, s, +0.2069958494722676234e+0);
  u = mla(u, s, +0.5393829292058536229e+0);
  u = mla(u, s, +0.1171255148908541655e+1);
  u = mla(u, s, +0.2034678592293432953e+1);
  u = mla(u, s, +0.2650949055239205876e+1);
  u = mla(u, s, +0.2302585092994045901e+1);
  u = ddnormalize_d2_d2(ddadd_d2_d_d2(1, ddmul_d2_d_d(u, s))).x;
  u = ldexp2k(u, q);

  if (d > 308.25471555991671) u = SLEEF_INFINITY; // log10(DBL_MAX)
  if (d < -350) u = 0;

  return u;
}

/* xexp10_u35: 10^d, faster tier — same coefficients, plain mla finish. */
EXPORT CONST double xexp10_u35(double d) {
  int q = (int)rintk(d * LOG10_2);
  double s, u;

  s = mla(q, -L10U, d);
  s = mla(q, -L10L, s);

  u = +0.2411463498334267652e-3;
  u = mla(u, s, +0.1157488415217187375e-2);
  u = mla(u, s, +0.5013975546789733659e-2);
  u = mla(u, s, +0.1959762320720533080e-1);
  u = mla(u, s, +0.6808936399446784138e-1);
  u = mla(u, s, +0.2069958494722676234e+0);
  u = mla(u, s, +0.5393829292058536229e+0);
  u = mla(u, s, +0.1171255148908541655e+1);
  u = mla(u, s, +0.2034678592293432953e+1);
  u = mla(u, s, +0.2650949055239205876e+1);
  u = mla(u, s, +0.2302585092994045901e+1);
  u = mla(u, s, +0.1000000000000000000e+1);
  u = ldexp2k(u, q);

  if (d > 308.25471555991671) u = SLEEF_INFINITY;
  if (d < -350) u = 0;

  return u;
}

/* xexpm1: exp(a) - 1 via the double-double expk2 kernel. */
EXPORT CONST double xexpm1(double a) {
  Sleef_double2 d = ddadd2_d2_d2_d(expk2(dd(a, 0)), -1.0);
  double x = d.x + d.y;
  if (a > 709.782712893383996732223) x = SLEEF_INFINITY; // log(DBL_MAX)
  if (a < -36.736800569677101399113302437) x = -1; // log(1 - nexttoward(1, 0))
  if (xisnegzero(a)) x = -0.0;
  return x;
}

/* xlog10: base-10 logarithm; same structure as xlog_u1 with log10 constants
   (ln(2)/ln(10) and 2/ln(10) carried in double-double). */
EXPORT CONST double xlog10(double d) {
  Sleef_double2 x, s;
  double m, t, x2;
  int e;

  int o = d < DBL_MIN;
  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);

  e = ilogb2k(d * (1.0/0.75));
  m = ldexp3k(d, -e);
  if (o) e -= 64;

  x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m));
  x2 = x.x * x.x;
  double x4 = x2 * x2, x8 = x4 * x4;
  t = POLY7(x2, x4, x8,
            +0.6653725819576758460e-1, +0.6625722782820833712e-1,
            +0.7898105214313944078e-1, +0.9650955035715275132e-1,
            +0.1240841409721444993e+0, +0.1737177927454605086e+0,
            +0.2895296546021972617e+0);
  s = ddmul_d2_d2_d(dd(0.30102999566398119802, -2.803728127785170339e-18), (double)e);
  s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d2(x, dd(0.86858896380650363334, 1.1430059694096389311e-17)));
  s = ddadd_d2_d2_d(s, x2 * x.x * t);

  double r = s.x + s.y;
  if (xisinf(d)) r = SLEEF_INFINITY;
  if (d < 0 || xisnan(d)) r = SLEEF_NAN;
  if (d == 0) r = -SLEEF_INFINITY;
  return r;
}

/* xlog2: base-2 logarithm, double-double tier; e is exact, 2/ln(2) in dd. */
EXPORT CONST double xlog2(double d) {
  Sleef_double2 x, s;
  double m, t, x2;
  int e;

  int o = d < DBL_MIN;
  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);

  e = ilogb2k(d * (1.0/0.75));
  m = ldexp3k(d, -e);
  if (o) e -= 64;

  x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m));
  x2 = x.x * x.x;
  double x4 = x2 * x2, x8 = x4 * x4;
  t = POLY7(x2, x4, x8,
            +0.2211941750456081490e+0, +0.2200768693152277689e+0,
            +0.2623708057488514656e+0, +0.3205977477944495502e+0,
            +0.4121985945485324709e+0, +0.5770780162997058982e+0,
            +0.96179669392608091449);
  s = ddadd2_d2_d_d2(e, ddmul_d2_d2_d2(x, dd(2.885390081777926774, 6.0561604995516736434e-18)));
  s = ddadd2_d2_d2_d(s, x2 * x.x * t);

  double r = s.x + s.y;
  if (xisinf(d)) r = SLEEF_INFINITY;
  if (d < 0 || xisnan(d)) r = SLEEF_NAN;
  if (d == 0) r = -SLEEF_INFINITY;
  return r;
}

/* xlog2_u35: base-2 logarithm, faster tier — plain-double series, dd only for
   the leading e + x*2/ln(2) term. */
EXPORT CONST double xlog2_u35(double d) {
  double m, t, x, x2;
  int e;

  int o = d < DBL_MIN;
  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);

  e = ilogb2k(d * (1.0/0.75));
  m = ldexp3k(d, -e);
  if (o) e -= 64;

  x = (m - 1) / (m + 1);
  x2 = x * x;

  t = +0.2211941750456081490e+0;
  t = mla(t, x2, +0.2200768693152277689e+0);
  t = mla(t, x2, +0.2623708057488514656e+0);
  t = mla(t, x2, +0.3205977477944495502e+0);
  t = mla(t, x2, +0.4121985945485324709e+0);
  t = mla(t, x2, +0.5770780162997058982e+0);
  t = mla(t, x2, +0.96179669392608091449 );

  Sleef_double2 s = ddadd_d2_d_d2(e, ddmul_d2_d_d(2.885390081777926774, x));
  double r = mla(t, x * x2, s.x + s.y);

  if (xisinf(d)) r = SLEEF_INFINITY;
  if (d < 0 || xisnan(d)) r = SLEEF_NAN;
  if (d == 0) r = -SLEEF_INFINITY;
  return r;
}

/* xlog1p: log(1+d). Splits 1+d as m * 2^e with m = d*t + (t-1) computed so the
   "+1" is exact; series evaluated on m/(2+m). */
EXPORT CONST double xlog1p(double d) {
  Sleef_double2 x, s;
  double m, t, x2;
  int e;

  double dp1 = d + 1;

  int o = dp1 < DBL_MIN;
  if (o) dp1 *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);

  e = ilogb2k(dp1 * (1.0/0.75));

  t = ldexp3k(1, -e);
  m = mla(d, t, t - 1);

  if (o) e -= 64;

  x = dddiv_d2_d2_d2(dd(m, 0), ddadd_d2_d_d(2, m));
  x2 = x.x * x.x;
  double x4 = x2 * x2, x8 = x4 * x4;
  t = POLY7(x2, x4, x8,
            0.1532076988502701353e+0, 0.1525629051003428716e+0,
            0.1818605932937785996e+0, 0.2222214519839380009e+0,
            0.2857142932794299317e+0, 0.3999999999635251990e+0,
            0.6666666666667333541e+0);
  s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), (double)e);
  s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2));
  s = ddadd_d2_d2_d(s, x2 * x.x * t);

  double r = s.x + s.y;
  if (d > 1e+307) r = SLEEF_INFINITY;
  if (d < -1 || xisnan(d)) r = SLEEF_NAN;
  if (d == -1) r = -SLEEF_INFINITY;
  if (xisnegzero(d)) r = -0.0;
  return r;
}

/* xfma: fused multiply-add emulated via double-double. Pre-scales operands
   when x*y+z is near under/overflow so the dd product stays in range, and
   defers to the hardware result h2 for inf/NaN cases. */
EXPORT CONST double xfma(double x, double y, double z) {
  double h2 = x * y + z, q = 1;
  if (fabsk(h2) < 1e-300) {       // near underflow: scale up, compensate via q
    const double c0 = UINT64_C(1) << 54, c1 = c0 * c0, c2 = c1 * c1;
    x *= c1;
    y *= c1;
    z *= c2;
    q = 1.0 / c2;
  }
  if (fabsk(h2) > 1e+299) {       // near overflow: scale down, compensate via q
    const double c0 = UINT64_C(1) << 54, c1 = c0 * c0, c2 = c1 * c1;
    x *= 1.0 / c1;
    y *= 1.0 / c1;
    z *= 1. / c2;
    q = c2;
  }
  Sleef_double2 d = ddmul_d2_d_d(x, y);
  d = ddadd2_d2_d2_d(d, z);
  double ret = (x == 0 || y == 0) ? z : (d.x + d.y);
  if ((xisinf(z) && !xisinf(x) && !xisnan(x) && !xisinf(y) && !xisnan(y))) h2 = z;
  return (xisinf(h2) || xisnan(h2)) ? h2 : ret*q;
}

/* xsqrt_u05: square root. Rescales extreme inputs, seeds with the classic
   bit-trick inverse-sqrt estimate, three Newton steps, then one double-double
   correction. Tail continues on the next chunk line. */
EXPORT CONST double xsqrt_u05(double d) {
  double q = 0.5;

  d = d < 0 ? SLEEF_NAN : d;

  if (d < 8.636168555094445E-78) {   // subnormal range: scale by 2^256
    d *= 1.157920892373162E77;
    q = 2.9387358770557188E-39 * 0.5;
  }

  if (d > 1.3407807929942597e+154) { // huge range: scale down
    d *= 7.4583407312002070e-155;
    q = 1.1579208923731620e+77 * 0.5;
  }

  // http://en.wikipedia.org/wiki/Fast_inverse_square_root
  double x = longBitsToDouble(0x5fe6ec85e7de30da - (doubleToRawLongBits(d + 1e-320) >> 1));

  x = x * (1.5 - 0.5 * d * x * x);
  x = x * (1.5 - 0.5 * d * x * x);
  x = x * (1.5 - 0.5 * d * x * x) * d;

  Sleef_double2 d2 = ddmul_d2_d2_d2(ddadd2_d2_d_d2(d, ddmul_d2_d_d(x, x)), ddrec_d2_d(x));

  double ret = (d2.x + d2.y) * q;

  ret = d == SLEEF_INFINITY ? SLEEF_INFINITY : ret;
  ret = d == 0 ?
d : ret;                     // d == 0 (incl. -0): return d itself

  return ret;
}

/* xsqrt_u35: alias of the u05 implementation. */
EXPORT CONST double xsqrt_u35(double d) { return xsqrt_u05(d); }

/* xsqrt: delegate to the platform sqrt instruction/intrinsic. */
EXPORT CONST double xsqrt(double d) { return SQRT(d); }

/* xfabs / xcopysign: thin wrappers over the bit-level helpers. */
EXPORT CONST double xfabs(double x) { return fabsk(x); }

EXPORT CONST double xcopysign(double x, double y) { return copysignk(x, y); }

/* xfmax / xfmin: C99 fmax/fmin semantics — if one operand is NaN, return the
   other (the y != y test catches NaN in y; NaN in x falls through the compare). */
EXPORT CONST double xfmax(double x, double y) {
  return y != y ? x : (x > y ? x : y);
}

EXPORT CONST double xfmin(double x, double y) {
  return y != y ? x : (x < y ? x : y);
}

/* xfdim: positive difference, fdim semantics: max(x - y, 0). */
EXPORT CONST double xfdim(double x, double y) {
  double ret = x - y;
  if (ret < 0 || x == y) ret = 0;
  return ret;
}

/* xtrunc: truncate toward zero. The 2^31 split handles magnitudes beyond the
   int32 range; values >= 2^52 are already integral. */
EXPORT CONST double xtrunc(double x) {
  double fr = x - (double)(INT64_C(1) << 31) * (int32_t)(x * (1.0 / (INT64_C(1) << 31)));
  fr = fr - (int32_t)fr;
  return (xisinf(x) || fabsk(x) >= (double)(INT64_C(1) << 52)) ? x : copysignk(x - fr, x);
}

/* xfloor: round toward -infinity, same 2^31/2^52 decomposition as xtrunc. */
EXPORT CONST double xfloor(double x) {
  double fr = x - (double)(INT64_C(1) << 31) * (int32_t)(x * (1.0 / (INT64_C(1) << 31)));
  fr = fr - (int32_t)fr;
  fr = fr < 0 ? fr+1.0 : fr;
  return (xisinf(x) || fabsk(x) >= (double)(INT64_C(1) << 52)) ? x : copysignk(x - fr, x);
}

/* xceil: round toward +infinity. */
EXPORT CONST double xceil(double x) {
  double fr = x - (double)(INT64_C(1) << 31) * (int32_t)(x * (1.0 / (INT64_C(1) << 31)));
  fr = fr - (int32_t)fr;
  fr = fr <= 0 ? fr : fr-1.0;
  return (xisinf(x) || fabsk(x) >= (double)(INT64_C(1) << 52)) ? x : copysignk(x - fr, x);
}

/* xround: round half away from zero; the nextafter(0.5, 0) guard keeps the
   largest value below 0.5 from rounding up through the d + 0.5 bias. */
EXPORT CONST double xround(double d) {
  double x = d + 0.5;
  double fr = x - (double)(INT64_C(1) << 31) * (int32_t)(x * (1.0 / (INT64_C(1) << 31)));
  fr = fr - (int32_t)fr;
  if (fr == 0 && x <= 0) x--;
  fr = fr < 0 ? fr+1.0 : fr;
  x = d == 0.49999999999999994449 ? 0 : x;  // nextafter(0.5, 0)
  return (xisinf(d) || fabsk(d) >= (double)(INT64_C(1) << 52)) ? d : copysignk(x - fr, d);
}

/* xrint: round to nearest even via the add-then-subtract-2^52 trick; orsign
   restores the sign of zero results. */
EXPORT CONST double xrint(double d) {
  double c = mulsign(INT64_C(1) << 52, d);
  return fabsk(d) > INT64_C(1) << 52 ?
    d : orsign(d + c - c, d);
}

/* xhypot_u05: sqrt(x^2 + y^2) without intermediate overflow/underflow —
   computed as max * sqrt(1 + (min/max)^2) in double-double, with a subnormal
   prescale by 2^54. */
EXPORT CONST double xhypot_u05(double x, double y) {
  x = fabsk(x);
  y = fabsk(y);
  double min = fmink(x, y), n = min;
  double max = fmaxk(x, y), d = max;

  if (max < DBL_MIN) { n *= UINT64_C(1) << 54; d *= UINT64_C(1) << 54; }
  Sleef_double2 t = dddiv_d2_d2_d2(dd(n, 0), dd(d, 0));
  t = ddmul_d2_d2_d(ddsqrt_d2_d2(ddadd2_d2_d2_d(ddsqu_d2_d2(t), 1)), max);
  double ret = t.x + t.y;
  if (xisnan(ret)) ret = SLEEF_INFINITY;
  if (min == 0) ret = max;
  if (xisnan(x) || xisnan(y)) ret = SLEEF_NAN;
  if (x == SLEEF_INFINITY || y == SLEEF_INFINITY) ret = SLEEF_INFINITY;
  return ret;
}

/* xhypot_u35: faster hypot, plain-double max * sqrt(1 + (min/max)^2). */
EXPORT CONST double xhypot_u35(double x, double y) {
  x = fabsk(x);
  y = fabsk(y);
  double min = fmink(x, y);
  double max = fmaxk(x, y);

  double t = min / max;
  double ret = max * SQRT(1 + t*t);
  if (min == 0) ret = max;
  if (xisnan(x) || xisnan(y)) ret = SLEEF_NAN;
  if (x == SLEEF_INFINITY || y == SLEEF_INFINITY) ret = SLEEF_INFINITY;
  return ret;
}

/* xnextafter: step x one ULP toward y by incrementing/decrementing the bit
   pattern; the sign-magnitude <-> two's-complement flip (c) makes a single
   integer decrement correct on both sides of zero. */
EXPORT CONST double xnextafter(double x, double y) {
  union {
    double f;
    int64_t i;
  } cx;

  x = x == 0 ? mulsign(0, y) : x;
  cx.f = x;
  int c = (cx.i < 0) == (y < x);
  if (c) cx.i = -(cx.i ^ (UINT64_C(1) << 63));

  if (x != y) cx.i--;

  if (c) cx.i = -(cx.i ^ (UINT64_C(1) << 63));

  if (cx.f == 0 && x != 0) cx.f = mulsign(0, x);
  if (x == 0 && y == 0) cx.f = y;
  if (xisnan(x) || xisnan(y)) cx.f = SLEEF_NAN;

  return cx.f;
}

/* xfrfrexp: fraction part of frexp — force the exponent field to 0x3fe so the
   result lies in [0.5, 1); subnormals are prescaled by 2^63 first. */
EXPORT CONST double xfrfrexp(double x) {
  union {
    double f;
    uint64_t u;
  } cx;

  if (fabsk(x) < DBL_MIN) x *= (UINT64_C(1) << 63);

  cx.f = x;
  cx.u &= ~UINT64_C(0x7ff0000000000000);
  cx.u |= UINT64_C(0x3fe0000000000000);

  if (xisinf(x)) cx.f = mulsign(SLEEF_INFINITY, x);
  if (x == 0) cx.f = x;

  return cx.f;
}

/* xexpfrexp: exponent part of frexp, extracted from the biased exponent field
   (subnormals compensated by the 2^63 prescale / -63 adjustment). */
EXPORT CONST int xexpfrexp(double x) {
  union {
    double f;
    uint64_t u;
  } cx;

  int ret = 0;

  if (fabsk(x) < DBL_MIN) { x *= (UINT64_C(1) << 63); ret = -63; }

  cx.f = x;
  ret += (int32_t)(((cx.u >> 52) & 0x7ff)) - 0x3fe;

  if (x == 0 || xisnan(x) || xisinf(x)) ret = 0;

  return ret;
}

/* toward0: next representable value toward zero (0 maps to 0). */
static INLINE CONST double toward0(double d) {
  return d == 0 ?
    0 : longBitsToDouble(doubleToRawLongBits(d)-1);
}

/* removelsb: clear the least-significant mantissa bit (keeps q exactly
   representable in the fmod/remainder loops below). */
static INLINE CONST double removelsb(double d) {
  return longBitsToDouble(doubleToRawLongBits(d) & INT64_C(0xfffffffffffffffe));
}

/* ptrunc: truncation used by xfmod's quotient estimate (same 2^31/2^52
   decomposition as xtrunc, positive-path only). */
static INLINE CONST double ptrunc(double x) {
  double fr = mla(-(double)(INT64_C(1) << 31), (int32_t)(x * (1.0 / (INT64_C(1) << 31))), x);
  return fabsk(x) >= (double)(INT64_C(1) << 52) ? x : (x - (fr - (int32_t)fr));
}

/* xfmod: floating-point remainder with truncated quotient. Iterates at most
   ceil(log2(DBL_MAX)/52) = 21 times, subtracting q*d in double-double each
   round; q is conservatively rounded toward zero so the remainder stays >= 0. */
EXPORT CONST double xfmod(double x, double y) {
  double n = fabsk(x), d = fabsk(y), s = 1, q;
  if (d < DBL_MIN) { n *= UINT64_C(1) << 54; d *= UINT64_C(1) << 54; s = 1.0 / (UINT64_C(1) << 54); }
  Sleef_double2 r = dd(n, 0);
  double rd = toward0(1.0 / d);

  for(int i=0;i < 21;i++) { // ceil(log2(DBL_MAX) / 52)
    q = removelsb(ptrunc(toward0(r.x) * rd));
    q = (3*d > r.x && r.x > d) ? 2 : q;
    q = (2*d > r.x && r.x > d) ? 1 : q;
    q = r.x == d ? (r.y >= 0 ? 1 : 0) : q;
    r = ddnormalize_d2_d2(ddadd2_d2_d2_d2(r, ddmul_d2_d_d(q, -d)));
    if (r.x < d) break;
  }

  double ret = r.x * s;
  if (r.x + r.y == d) ret = 0;
  ret = mulsign(ret, x);
  if (n < d) ret = x;
  if (d == 0) ret = SLEEF_NAN;

  return ret;
}

/* rintk2: round to nearest even (same trick as xrint; local copy for
   xremainder's quotient estimate). */
static INLINE CONST double rintk2(double d) {
  double c = mulsign(INT64_C(1) << 52, d);
  return fabsk(d) > INT64_C(1) << 52 ? d : orsign(d + c - c, d);
}

/* xremainder: IEEE remainder (round-to-nearest quotient); tracks quotient
   parity (qisodd) to break the halfway ties. Tail continues on the next
   chunk line. */
EXPORT CONST double xremainder(double x, double y) {
  double n = fabsk(x), d = fabsk(y), s = 1, q;
  if (d < DBL_MIN*2) { n *= UINT64_C(1) << 54; d *= UINT64_C(1) << 54; s = 1.0 / (UINT64_C(1) << 54); }
  double rd = 1.0 / d;
  Sleef_double2 r = dd(n, 0);
  int qisodd = 0;

  for(int i=0;i < 21;i++) { // ceil(log2(DBL_MAX) / 52)
    q = removelsb(rintk2(r.x * rd));
    if (fabsk(r.x) < 1.5 * d) q = r.x < 0 ? -1 : 1;
    if (fabsk(r.x) < 0.5 * d || (fabsk(r.x) == 0.5 * d && !qisodd)) q = 0;
    if (q == 0) break;
    if (xisinf(q * -d)) q = q + mulsign(-1, r.x);
    qisodd ^= xisodd(q);
    r = ddnormalize_d2_d2(ddadd2_d2_d2_d2(r, ddmul_d2_d_d(q, -d)));
  }

  double ret = r.x * s;
  ret = mulsign(ret, x);
  if (xisinf(y)) ret = xisinf(x) ?
SLEEF_NAN : x;               // remainder(x, inf) = x for finite x, NaN otherwise
  if (d == 0) ret = SLEEF_NAN;

  return ret;
}

/* xmodf: split x into fractional (ret.x) and integral (ret.y) parts, both
   carrying the sign of x; values >= 2^52 have no fractional part. */
EXPORT CONST Sleef_double2 xmodf(double x) {
  double fr = x - (double)(INT64_C(1) << 31) * (int32_t)(x * (1.0 / (INT64_C(1) << 31)));
  fr = fr - (int32_t)fr;
  fr = fabsk(x) >= (double)(INT64_C(1) << 52) ? 0 : fr;
  Sleef_double2 ret = { copysignk(fr, x), copysignk(x - fr, x) };
  return ret;
}

/* dd2: pair of double-doubles — gammak returns {log-part a, factor b} so that
   gamma(x) = exp(a) * b and lgamma(x) = a + log(|b|). */
typedef struct {
  Sleef_double2 a, b;
} dd2;

/* gammak: shared kernel for tgamma/lgamma. Uses the reflection formula for
   a < 0.5 (oref), the recurrence gamma(x+1) = x*gamma(x) to shift the argument
   up past 7 (the y/clln products), then either a Stirling-type expansion in
   1/x (o2) or rational-series branches for x near 1 (o0) / near 2. The huge
   ternary chains pick the coefficient set for the active branch. */
static CONST dd2 gammak(double a) {
  Sleef_double2 clc = dd(0, 0), clln = dd(1, 0), clld = dd(1, 0), v = dd(1, 0), x, y, z;
  double t, u;

  int otiny = fabsk(a) < 1e-306, oref = a < 0.5;

  x = otiny ? dd(0, 0) : (oref ? ddadd2_d2_d_d(1, -a) : dd(a, 0));

  int o0 = (0.5 <= x.x && x.x <= 1.1), o2 = 2.3 < x.x;

  // Shift the argument up by 5 via the recurrence; the product goes to clln.
  y = ddnormalize_d2_d2(ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, 1), x));
  y = ddnormalize_d2_d2(ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, 2), y));
  y = ddnormalize_d2_d2(ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, 3), y));
  y = ddnormalize_d2_d2(ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, 4), y));

  clln = (o2 && x.x <= 7) ? y : clln;

  x = (o2 && x.x <= 7) ? ddadd2_d2_d2_d(x, 5) : x;
  t = o2 ? (1.0 / x.x) : ddnormalize_d2_d2(ddadd2_d2_d2_d(x, o0 ? -1 : -2)).x;

  // Branch-merged polynomial: o2 -> Stirling series in 1/x, o0 -> series
  // around 1, otherwise series around 2.
  u = o2 ? -156.801412704022726379848862 : (o0 ? +0.2947916772827614196e+2 : +0.7074816000864609279e-7);
  u = mla(u, t, o2 ? +1.120804464289911606838558160000 : (o0 ? +0.1281459691827820109e+3 : +0.4009244333008730443e-6));
  u = mla(u, t, o2 ? +13.39798545514258921833306020000 : (o0 ? +0.2617544025784515043e+3 : +0.1040114641628246946e-5));
  u = mla(u, t, o2 ? -0.116546276599463200848033357000 : (o0 ? +0.3287022855685790432e+3 : +0.1508349150733329167e-5));
  u = mla(u, t, o2 ? -1.391801093265337481495562410000 : (o0 ? +0.2818145867730348186e+3 : +0.1288143074933901020e-5));
  u = mla(u, t, o2 ? +0.015056113040026424412918973400 : (o0 ? +0.1728670414673559605e+3 : +0.4744167749884993937e-6));
  u = mla(u, t, o2 ? +0.179540117061234856098844714000 : (o0 ? +0.7748735764030416817e+2 : -0.6554816306542489902e-7));
  u = mla(u, t, o2 ? -0.002481743600264997730942489280 : (o0 ? +0.2512856643080930752e+2 : -0.3189252471452599844e-6));
  u = mla(u, t, o2 ? -0.029527880945699120504851034100 : (o0 ? +0.5766792106140076868e+1 : +0.1358883821470355377e-6));
  u = mla(u, t, o2 ? +0.000540164767892604515196325186 : (o0 ? +0.7270275473996180571e+0 : -0.4343931277157336040e-6));
  u = mla(u, t, o2 ? +0.006403362833808069794787256200 : (o0 ? +0.8396709124579147809e-1 : +0.9724785897406779555e-6));
  u = mla(u, t, o2 ? -0.000162516262783915816896611252 : (o0 ? -0.8211558669746804595e-1 : -0.2036886057225966011e-5));
  u = mla(u, t, o2 ? -0.001914438498565477526465972390 : (o0 ? +0.6828831828341884458e-1 : +0.4373363141819725815e-5));
  u = mla(u, t, o2 ? +7.20489541602001055898311517e-05 : (o0 ? -0.7712481339961671511e-1 : -0.9439951268304008677e-5));
  u = mla(u, t, o2 ? +0.000839498720672087279971000786 : (o0 ? +0.8337492023017314957e-1 : +0.2050727030376389804e-4));
  u = mla(u, t, o2 ? -5.17179090826059219329394422e-05 : (o0 ? -0.9094964931456242518e-1 : -0.4492620183431184018e-4));
  u = mla(u, t, o2 ? -0.000592166437353693882857342347 : (o0 ? +0.1000996313575929358e+0 : +0.9945751236071875931e-4));
  u = mla(u, t, o2 ? +6.97281375836585777403743539e-05 : (o0 ? -0.1113342861544207724e+0 : -0.2231547599034983196e-3));
  u = mla(u, t, o2 ? +0.000784039221720066627493314301 : (o0 ? +0.1255096673213020875e+0 : +0.5096695247101967622e-3));
  u = mla(u, t, o2 ? -0.000229472093621399176949318732 : (o0 ? -0.1440498967843054368e+0 : -0.1192753911667886971e-2));
  u = mla(u, t, o2 ? -0.002681327160493827160473958490 : (o0 ? +0.1695571770041949811e+0 : +0.2890510330742210310e-2));
  u = mla(u, t, o2 ? +0.003472222222222222222175164840 : (o0 ? -0.2073855510284092762e+0 : -0.7385551028674461858e-2));
  u = mla(u, t, o2 ? +0.083333333333333333335592087900 : (o0 ?
    +0.2705808084277815939e+0 : +0.2058080842778455335e-1));

  // Stirling branch: clc = (x - 0.5)*log(x) - x + log(sqrt(2*pi)).
  y = ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, -0.5), logk2(x));
  y = ddadd2_d2_d2_d2(y, ddneg_d2_d2(x));
  y = ddadd2_d2_d2_d2(y, dd(0.91893853320467278056, -3.8782941580672414498e-17)); // 0.5*log(2*M_PI)

  // Series branches: finish the low-order terms in double-double.
  z = ddadd2_d2_d2_d(ddmul_d2_d_d (u, t), o0 ? -0.4006856343865314862e+0 : -0.6735230105319810201e-1);
  z = ddadd2_d2_d2_d(ddmul_d2_d2_d(z, t), o0 ? +0.8224670334241132030e+0 : +0.3224670334241132030e+0);
  z = ddadd2_d2_d2_d(ddmul_d2_d2_d(z, t), o0 ? -0.5772156649015328655e+0 : +0.4227843350984671345e+0);
  z = ddmul_d2_d2_d(z, t);

  clc = o2 ? y : z;

  clld = o2 ? ddadd2_d2_d2_d(ddmul_d2_d_d(u, t), 1) : clld;

  y = clln;

  clc = otiny ? dd(83.1776616671934334590333, 3.67103459631568507221878e-15) : // log(2^120)
    (oref ? ddadd2_d2_d2_d2(dd(1.1447298858494001639, 1.026595116270782638e-17), ddneg_d2_d2(clc)) : clc); // log(M_PI)
  clln = otiny ? dd(1, 0) : (oref ? clln : clld);

  // Reflection: gamma(a) = pi / (sin(pi*a) * gamma(1-a)); argument of sinpik
  // is reduced mod 2^28 to keep it exact.
  if (oref) x = ddmul_d2_d2_d2(clld, sinpik(a - (double)(INT64_C(1) << 28) * (int32_t)(a * (1.0 / (INT64_C(1) << 28)))));

  clld = otiny ? dd(a*((INT64_C(1) << 60)*(double)(INT64_C(1) << 60)), 0) : (oref ? x : y);

  dd2 ret = { clc, dddiv_d2_d2_d2(clln, clld) };

  return ret;
}

/* xtgamma_u1: gamma(a) = exp(clc) * (clln/clld), with pole (non-positive
   integer), overflow, and NaN fixups. */
EXPORT CONST double xtgamma_u1(double a) {
  dd2 d = gammak(a);
  Sleef_double2 y = ddmul_d2_d2_d2(expk2(d.a), d.b);
  double r = y.x + y.y;
  r = (a == -SLEEF_INFINITY || (a < 0 && xisint(a)) || (xisnumber(a) && a < 0 && xisnan(r))) ? SLEEF_NAN : r;
  r = ((a == SLEEF_INFINITY || xisnumber(a)) && a >= -DBL_MIN && (a == 0 || a > 200 || xisnan(r))) ? mulsign(SLEEF_INFINITY, a) : r;
  return r;
}

/* xlgamma_u1: log|gamma(a)| = clc + log|clln/clld|; +inf at the poles. */
EXPORT CONST double xlgamma_u1(double a) {
  dd2 d = gammak(a);
  Sleef_double2 y = ddadd2_d2_d2_d2(d.a, logk2(ddabs_d2_d2(d.b)));
  double r = y.x + y.y;
  r = (xisinf(a) || (a <= 0 && xisint(a)) || (xisnumber(a) && xisnan(r))) ? SLEEF_INFINITY : r;
  return r;
}

/* xerf_u1: error function; three coefficient sets selected by |a| < 1, < 3.7,
   < 6. Definition continues past the end of this chunk. */
EXPORT CONST double xerf_u1(double a) {
  double s = a, t, u;
  Sleef_double2 d;

  a = fabsk(a);
  int o0 = a < 1.0, o1 = a < 3.7, o2 = a < 6.0;
  u = o0 ? (a*a) : a;

  t = o0 ? +0.6801072401395392157e-20 : o1 ? +0.2830954522087717660e-13 : -0.5846750404269610493e-17;
  t = mla(t, u, o0 ? -0.2161766247570056391e-18 : o1 ? -0.1509491946179481940e-11 : +0.6076691048812607898e-15);
  t = mla(t, u, o0 ? +0.4695919173301598752e-17 : o1 ? +0.3827857177807173152e-10 : -0.3007518609604893831e-13);
  t = mla(t, u, o0 ? -0.9049140419888010819e-16 : o1 ? -0.6139733921558987241e-09 : +0.9427906260824646063e-12);
  t = mla(t, u, o0 ? +0.1634018903557411517e-14 : o1 ? +0.6985387934608038824e-08 : -0.2100110908269393629e-10);
  t = mla(t, u, o0 ? -0.2783485786333455216e-13 : o1 ? -0.5988224513034371474e-07 : +0.3534639523461223473e-09);
  t = mla(t, u, o0 ? +0.4463221276786412722e-12 : o1 ? +0.4005716952355346640e-06 : -0.4664967728285395926e-08);
  t = mla(t, u, o0 ? -0.6711366622850138987e-11 : o1 ? -0.2132190104575784400e-05 : +0.4943823283769000532e-07);
  t = mla(t, u, o0 ? +0.9422759050232658346e-10 : o1 ? +0.9092461304042630325e-05 : -0.4271203394761148254e-06);
  t = mla(t, u, o0 ? -0.1229055530100228477e-08 : o1 ? -0.3079188080966205457e-04 : +0.3034067677404915895e-05);
  t = mla(t, u, o0 ? +0.1480719281585085023e-07 : o1 ? +0.7971413443082370762e-04 : -0.1776295289066871135e-04);
  t = mla(t, u, o0 ? -0.1636584469123402714e-06 : o1 ? -0.1387853215225442864e-03 : +0.8524547630559505050e-04);
  t = mla(t, u, o0 ? +0.1646211436588923363e-05 : o1 ? +0.6469678026257590965e-04 : -0.3290582944961784398e-03);
  t = mla(t, u, o0 ? -0.1492565035840624866e-04 : o1 ? +0.4996645280372945860e-03 : +0.9696966068789101157e-03);
  t = mla(t, u, o0 ? +0.1205533298178966496e-03 : o1 ? -0.1622802482842520535e-02 : -0.1812527628046986137e-02);
  t = mla(t, u, o0 ? -0.8548327023450851166e-03 : o1 ? +0.1615320557049377171e-03 : -0.4725409828123619017e-03);
  t = mla(t, u, o0 ? +0.5223977625442188799e-02 : o1 ? +0.1915262325574875607e-01 : +0.2090315427924229266e-01);
  t = mla(t, u, o0 ? -0.2686617064513125569e-01 : o1 ? -0.1027818298486033455e+00 : -0.1052041921842776645e+00);
  t = mla(t, u, o0 ?
+0.1128379167095512753e+00 : o1 ? -0.6366172819842503827e+00 : -0.6345351808766568347e+00); t = mla(t, u, o0 ? -0.3761263890318375380e+00 : o1 ? -0.1128379590648910469e+01 : -0.1129442929103524396e+01); d = ddmul_d2_d_d(t, u); d = ddadd2_d2_d2_d2(d, o0 ? dd(1.1283791670955125586, 1.5335459613165822674e-17) : o1 ? dd(3.4110644736196137587e-08, -2.4875650708323294246e-24) : dd(0.00024963035690526438285, -5.4362665034856259795e-21)); d = o0 ? ddmul_d2_d2_d(d, a) : ddadd_d2_d_d2(1.0, ddneg_d2_d2(expk2(d))); u = mulsign(o2 ? (d.x + d.y) : 1, s); u = xisnan(a) ? SLEEF_NAN : u; return u; } EXPORT CONST double xerfc_u15(double a) { double s = a, r = 0, t; Sleef_double2 u, d, x; a = fabsk(a); int o0 = a < 1.0, o1 = a < 2.2, o2 = a < 4.2, o3 = a < 27.3; u = o0 ? ddmul_d2_d_d(a, a) : o1 ? dd(a, 0) : dddiv_d2_d2_d2(dd(1, 0), dd(a, 0)); t = o0 ? +0.6801072401395386139e-20 : o1 ? +0.3438010341362585303e-12 : o2 ? -0.5757819536420710449e+2 : +0.2334249729638701319e+5; t = mla(t, u.x, o0 ? -0.2161766247570055669e-18 : o1 ? -0.1237021188160598264e-10 : o2 ? +0.4669289654498104483e+3 : -0.4695661044933107769e+5); t = mla(t, u.x, o0 ? +0.4695919173301595670e-17 : o1 ? +0.2117985839877627852e-09 : o2 ? -0.1796329879461355858e+4 : +0.3173403108748643353e+5); t = mla(t, u.x, o0 ? -0.9049140419888007122e-16 : o1 ? -0.2290560929177369506e-08 : o2 ? +0.4355892193699575728e+4 : +0.3242982786959573787e+4); t = mla(t, u.x, o0 ? +0.1634018903557410728e-14 : o1 ? +0.1748931621698149538e-07 : o2 ? -0.7456258884965764992e+4 : -0.2014717999760347811e+5); t = mla(t, u.x, o0 ? -0.2783485786333451745e-13 : o1 ? -0.9956602606623249195e-07 : o2 ? +0.9553977358167021521e+4 : +0.1554006970967118286e+5); t = mla(t, u.x, o0 ? +0.4463221276786415752e-12 : o1 ? +0.4330010240640327080e-06 : o2 ? -0.9470019905444229153e+4 : -0.6150874190563554293e+4); t = mla(t, u.x, o0 ? -0.6711366622850136563e-11 : o1 ? -0.1435050600991763331e-05 : o2 ? 
+0.7387344321849855078e+4 : +0.1240047765634815732e+4); t = mla(t, u.x, o0 ? +0.9422759050232662223e-10 : o1 ? +0.3460139479650695662e-05 : o2 ? -0.4557713054166382790e+4 : -0.8210325475752699731e+2); t = mla(t, u.x, o0 ? -0.1229055530100229098e-08 : o1 ? -0.4988908180632898173e-05 : o2 ? +0.2207866967354055305e+4 : +0.3242443880839930870e+2); t = mla(t, u.x, o0 ? +0.1480719281585086512e-07 : o1 ? -0.1308775976326352012e-05 : o2 ? -0.8217975658621754746e+3 : -0.2923418863833160586e+2); t = mla(t, u.x, o0 ? -0.1636584469123399803e-06 : o1 ? +0.2825086540850310103e-04 : o2 ? +0.2268659483507917400e+3 : +0.3457461732814383071e+0); t = mla(t, u.x, o0 ? +0.1646211436588923575e-05 : o1 ? -0.6393913713069986071e-04 : o2 ? -0.4633361260318560682e+2 : +0.5489730155952392998e+1); t = mla(t, u.x, o0 ? -0.1492565035840623511e-04 : o1 ? -0.2566436514695078926e-04 : o2 ? +0.9557380123733945965e+1 : +0.1559934132251294134e-2); t = mla(t, u.x, o0 ? +0.1205533298178967851e-03 : o1 ? +0.5895792375659440364e-03 : o2 ? -0.2958429331939661289e+1 : -0.1541741566831520638e+1); t = mla(t, u.x, o0 ? -0.8548327023450850081e-03 : o1 ? -0.1695715579163588598e-02 : o2 ? +0.1670329508092765480e+0 : +0.2823152230558364186e-5); t = mla(t, u.x, o0 ? +0.5223977625442187932e-02 : o1 ? +0.2089116434918055149e-03 : o2 ? +0.6096615680115419211e+0 : +0.6249999184195342838e+0); t = mla(t, u.x, o0 ? -0.2686617064513125222e-01 : o1 ? +0.1912855949584917753e-01 : o2 ? +0.1059212443193543585e-2 : +0.1741749416408701288e-8); d = ddmul_d2_d2_d(u, t); d = ddadd2_d2_d2_d2(d, o0 ? dd(0.11283791670955126141, -4.0175691625932118483e-18) : o1 ? dd(-0.10277263343147646779, -6.2338714083404900225e-18) : o2 ? dd(-0.50005180473999022439, 2.6362140569041995803e-17) : dd(-0.5000000000258444377, -4.0074044712386992281e-17)); d = ddmul_d2_d2_d2(d, u); d = ddadd2_d2_d2_d2(d, o0 ? dd(-0.37612638903183753802, 1.3391897206042552387e-17) : o1 ? dd(-0.63661976742916359662, 7.6321019159085724662e-18) : o2 ? 
dd(1.601106273924963368e-06, 1.1974001857764476775e-23) : dd(2.3761973137523364792e-13, -1.1670076950531026582e-29)); d = ddmul_d2_d2_d2(d, u); d = ddadd2_d2_d2_d2(d, o0 ? dd(1.1283791670955125586, 1.5335459613165822674e-17) : o1 ? dd(-1.1283791674717296161, 8.0896847755965377194e-17) : o2 ? dd(-0.57236496645145429341, 3.0704553245872027258e-17) : dd(-0.57236494292470108114, -2.3984352208056898003e-17)); x = ddmul_d2_d2_d(o1 ? d : dd(-a, 0), a); x = o1 ? x : ddadd2_d2_d2_d2(x, d); x = o0 ? ddsub_d2_d2_d2(dd(1, 0), x) : expk2(x); x = o1 ? x : ddmul_d2_d2_d2(x, u); r = o3 ? (x.x + x.y) : 0; if (s < 0) r = 2 - r; r = xisnan(s) ? SLEEF_NAN : r; return r; } #ifdef ENABLE_MAIN // gcc -w -DENABLE_MAIN -I../common sleefdp.c rempitab.c -lm #include int main(int argc, char **argv) { double d1 = atof(argv[1]); printf("arg1 = %.20g\n", d1); //int i1 = atoi(argv[1]); //double d2 = atof(argv[2]); //printf("arg2 = %.20g\n", d2); //printf("%d\n", (int)d2); #if 0 double d3 = atof(argv[3]); printf("arg3 = %.20g\n", d3); #endif //printf("%g\n", pow2i(i1)); //int exp = xexpfrexp(d1); //double r = xnextafter(d1, d2); //double r = xfma(d1, d2, d3); printf("test = %.20g\n", xcos_u1(d1)); //printf("test = %.20g\n", xlog(d1)); //r = nextafter(d1, d2); printf("corr = %.20g\n", cos(d1)); //printf("%.20g %.20g\n", xround(d1), xrint(d1)); //Sleef_double2 r = xsincospi_u35(d); //printf("%g, %g\n", (double)r.x, (double)r.y); } #endif ================================================ FILE: src/sleefsimddp.c ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) // Always use -ffp-contract=off option to compile SLEEF. 
#if !defined(SLEEF_GENHEADER) #include #include #include #include #endif #include "misc.h" extern const double Sleef_rempitabdp[]; #define __SLEEFSIMDDP_C__ #if (defined(_MSC_VER)) #pragma fp_contract (off) #endif // Intel #ifdef ENABLE_SSE2 #define CONFIG 2 #include "helpersse2.h" #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renamesse2_gnuabi.h" #else #include "renamesse2.h" #endif #endif #endif #ifdef ENABLE_SSE4 #define CONFIG 4 #include "helpersse2.h" #ifdef DORENAME #include "renamesse4.h" #endif #endif #ifdef ENABLE_AVX #define CONFIG 1 #include "helperavx.h" #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renameavx_gnuabi.h" #else #include "renameavx.h" #endif #endif #endif #ifdef ENABLE_FMA4 #define CONFIG 4 #include "helperavx.h" #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renamefma4_gnuabi.h" #else #include "renamefma4.h" #endif #endif #endif #ifdef ENABLE_AVX2 #define CONFIG 1 #include "helperavx2.h" #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renameavx2_gnuabi.h" #else #include "renameavx2.h" #endif #endif #endif #ifdef ENABLE_AVX2128 #define CONFIG 1 #include "helperavx2_128.h" #ifdef DORENAME #include "renameavx2128.h" #endif #endif #ifdef ENABLE_AVX512F #define CONFIG 1 #include "helperavx512f.h" #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renameavx512f_gnuabi.h" #else #include "renameavx512f.h" #endif #endif #endif #ifdef ENABLE_AVX512FNOFMA #define CONFIG 2 #include "helperavx512f.h" #ifdef DORENAME #include "renameavx512fnofma.h" #endif #endif // Arm #ifdef ENABLE_ADVSIMD #define CONFIG 1 #include "helperadvsimd.h" #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renameadvsimd_gnuabi.h" #else #include "renameadvsimd.h" #endif #endif #endif #ifdef ENABLE_ADVSIMDNOFMA #define CONFIG 2 #include "helperadvsimd.h" #ifdef DORENAME #include "renameadvsimdnofma.h" #endif #endif #ifdef ENABLE_SVE #define CONFIG 1 #include "helpersve.h" #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renamesve_gnuabi.h" #else #include "renamesve.h" #endif /* 
ENABLE_GNUABI */ #endif /* DORENAME */ #endif /* ENABLE_SVE */

// One CONFIG/helper/rename triple per target ISA; exactly one ENABLE_* macro
// is defined per compilation of this translation unit.
#ifdef ENABLE_SVENOFMA
#define CONFIG 2
#include "helpersve.h"
#ifdef DORENAME
#include "renamesvenofma.h"
#endif /* DORENAME */
#endif /* ENABLE_SVE */

// IBM
#ifdef ENABLE_VSX
#define CONFIG 1
#include "helperpower_128.h"
#ifdef DORENAME
#include "renamevsx.h"
#endif
#endif

#ifdef ENABLE_VSXNOFMA
#define CONFIG 2
#include "helperpower_128.h"
#ifdef DORENAME
#include "renamevsxnofma.h"
#endif
#endif

#ifdef ENABLE_ZVECTOR2
#define CONFIG 140
#include "helpers390x_128.h"
#ifdef DORENAME
#include "renamezvector2.h"
#endif
#endif

#ifdef ENABLE_ZVECTOR2NOFMA
#define CONFIG 141
#include "helpers390x_128.h"
#ifdef DORENAME
#include "renamezvector2nofma.h"
#endif
#endif

// Generic
#ifdef ENABLE_VECEXT
#define CONFIG 1
#include "helpervecext.h"
#ifdef DORENAME
#include "renamevecext.h"
#endif
#endif

#ifdef ENABLE_PUREC
#define CONFIG 1
#include "helperpurec.h"
#ifdef DORENAME
#include "renamepurec.h"
#endif
#endif

#ifdef ENABLE_PUREC_SCALAR
#define CONFIG 1
#include "helperpurec_scalar.h"
#ifdef DORENAME
#include "renamepurec_scalar.h"
#endif
#endif

#ifdef ENABLE_PURECFMA_SCALAR
#define CONFIG 2
#include "helperpurec_scalar.h"
#ifdef DORENAME
#include "renamepurecfma_scalar.h"
#endif
#endif

//

// MLA/C2V adapt the estrin.h polynomial macros (POLY6/POLY8 below) to the
// vector helper API of the selected ISA.
#define MLA(x, y, z) vmla_vd_vd_vd_vd((x), (y), (z))
#define C2V(c) vcast_vd_d(c)
#include "estrin.h"

//

#include "dd.h"

//

// Logical NOT of a 64-bit comparison mask (xor against all-ones).
static INLINE VECTOR_CC vopmask vnot_vo64_vo64(vopmask x) {
  return vxor_vo_vo_vo(x, veq64_vo_vm_vm(vcast_vm_i_i(0, 0), vcast_vm_i_i(0, 0)));
}

// Per-lane mask: sign bit of d is set (true for -0.0 too).
static INLINE CONST VECTOR_CC vopmask vsignbit_vo_vd(vdouble d) {
  return veq64_vo_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}

// return d0 < d1 ? x : y
static INLINE CONST VECTOR_CC vint vsel_vi_vd_vd_vi_vi(vdouble d0, vdouble d1, vint x, vint y) { return vsel_vi_vo_vi_vi(vcast_vo32_vo64(vlt_vo_vd_vd(d0, d1)), x, y); }

// return d0 < 0 ? x : 0
static INLINE CONST VECTOR_CC vint vsel_vi_vd_vi(vdouble d, vint x) { return vand_vi_vo_vi(vcast_vo32_vo64(vsignbit_vo_vd(d)), x); }

// Per-lane mask: d is exactly -0.0 (bit-pattern compare).
static INLINE CONST VECTOR_CC vopmask visnegzero_vo_vd(vdouble d) {
  return veq64_vo_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}

// Per-lane mask: x is finite (not inf, not NaN).
static INLINE CONST VECTOR_CC vopmask visnumber_vo_vd(vdouble x) {
  return vandnot_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, x));
}

// Isolate the sign bit of each lane as a raw integer mask.
static INLINE CONST VECTOR_CC vmask vsignbit_vm_vd(vdouble d) {
  return vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}

// x with its sign flipped wherever y is negative (x * sign(y), bitwise).
static INLINE CONST VECTOR_CC vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) {
  return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(x), vsignbit_vm_vd(y)));
}

// Magnitude of x combined with the sign of y (vector copysign).
static INLINE CONST VECTOR_CC vdouble vcopysign_vd_vd_vd(vdouble x, vdouble y) {
  return vreinterpret_vd_vm(vxor_vm_vm_vm(vandnot_vm_vm_vm(vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(x)),
					  vand_vm_vm_vm   (vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(y))));
}

// +-1.0 carrying the sign of d.
static INLINE CONST VECTOR_CC vdouble vsign_vd_vd(vdouble d) {
  return vmulsign_vd_vd_vd(vcast_vd_d(1.0), d);
}

// 2^q built directly in the exponent field (biased exponent shifted into
// the high 32 bits of each lane).
static INLINE CONST VECTOR_CC vdouble vpow2i_vd_vi(vint q) {
  q = vadd_vi_vi_vi(vcast_vi_i(0x3ff), q);
  vint2 r = vcastu_vi2_vi(q);
  return vreinterpret_vd_vi2(vsll_vi2_vi2_i(r, 20));
}

// x * 2^q with a wide valid range for q: the scale is applied as four equal
// factors y plus a residual 2^q so intermediate products cannot overflow or
// flush prematurely; the exponent is clamped into [0, 0x7ff].
static INLINE CONST VECTOR_CC vdouble vldexp_vd_vd_vi(vdouble x, vint q) {
  vint m = vsra_vi_vi_i(q, 31);
  m = vsll_vi_vi_i(vsub_vi_vi_vi(vsra_vi_vi_i(vadd_vi_vi_vi(m, q), 9), m), 7);
  q = vsub_vi_vi_vi(q, vsll_vi_vi_i(m, 2));
  m = vadd_vi_vi_vi(vcast_vi_i(0x3ff), m);
  m = vandnot_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), m), m);
  m = vsel_vi_vo_vi_vi(vgt_vo_vi_vi(m, vcast_vi_i(0x7ff)), vcast_vi_i(0x7ff), m);
  vint2 r = vcastu_vi2_vi(m);
  vdouble y = vreinterpret_vd_vi2(vsll_vi2_vi2_i(r, 20));
  return vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, y), y), y), y), vpow2i_vd_vi(q));
}

// d * 2^e via two half-sized power-of-two factors (cheaper, narrower range).
static INLINE CONST VECTOR_CC vdouble vldexp2_vd_vd_vi(vdouble d, vint e) {
  return vmul_vd_vd_vd(vmul_vd_vd_vd(d, vpow2i_vd_vi(vsra_vi_vi_i(e, 1))), vpow2i_vd_vi(vsub_vi_vi_vi(e, vsra_vi_vi_i(e, 1))));
}

// d * 2^q by adding q directly to the exponent bits — fastest variant;
// valid only when the result stays normal (no range checks).
static INLINE CONST VECTOR_CC vdouble vldexp3_vd_vd_vi(vdouble d, vint q) {
  return vreinterpret_vd_vi2(vadd_vi2_vi2_vi2(vreinterpret_vi2_vd(d), vsll_vi2_vi2_i(vcastu_vi2_vi(q), 20)));
}

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
// ilogb kernel: extracts the unbiased exponent; subnormals are first scaled
// by 2^300 so their exponent field becomes usable, then 300 is subtracted.
static INLINE CONST VECTOR_CC vint vilogbk_vi_vd(vdouble d) {
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(4.9090934652977266E-91));
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d);
  vint q = vcastu_vi_vi2(vreinterpret_vi2_vd(d));
  q = vand_vi_vi_vi(q, vcast_vi_i(((1 << 12)-1) << 20));
  q = vsrl_vi_vi_i(q, 20);
  q = vsub_vi_vi_vi(q, vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vcast_vi_i(300 + 0x3ff), vcast_vi_i(0x3ff)));
  return q;
}

// Cheaper exponent extraction: raw biased exponent minus 0x3ff, no
// subnormal handling.
static INLINE CONST VECTOR_CC vint vilogb2k_vi_vd(vdouble d) {
  vint q = vcastu_vi_vi2(vreinterpret_vi2_vd(d));
  q = vsrl_vi_vi_i(q, 20);
  q = vand_vi_vi_vi(q, vcast_vi_i(0x7ff));
  q = vsub_vi_vi_vi(q, vcast_vi_i(0x3ff));
  return q;
}
#endif

// Per-lane mask: d holds an integral value. Without FULL_FP_ROUNDING the
// test is split around 2^31 so the truncation stays exact; |d| > 2^53 is
// always integral.
static INLINE CONST VECTOR_CC vopmask visint_vo_vd(vdouble d) {
#ifdef FULL_FP_ROUNDING
  return veq_vo_vd_vd(vtruncate_vd_vd(d), d);
#else
  vdouble x = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0 / (INT64_C(1) << 31))));
  x = vmla_vd_vd_vd_vd(vcast_vd_d(-(double)(INT64_C(1) << 31)), x, d);
  return vor_vo_vo_vo(veq_vo_vd_vd(vtruncate_vd_vd(x), x),
		      vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 53)));
#endif
}

// Per-lane mask: d is an odd integer (|d| must be < 2^53 in the fallback
// path, since larger doubles cannot represent odd values).
static INLINE CONST VECTOR_CC vopmask visodd_vo_vd(vdouble d) {
#ifdef FULL_FP_ROUNDING
  vdouble x = vmul_vd_vd_vd(d, vcast_vd_d(0.5));
  return vneq_vo_vd_vd(vtruncate_vd_vd(x), x);
#else
  vdouble x = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0 / (INT64_C(1) << 31))));
  x = vmla_vd_vd_vd_vd(vcast_vd_d(-(double)(INT64_C(1) << 31)), x, d);
  return vand_vo_vo_vo(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vtruncate_vi_vd(x), vcast_vi_i(1)), vcast_vi_i(1))),
		       vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 53)));
#endif
}

//

// Public ldexp: defers to the wide-range kernel above.
EXPORT CONST VECTOR_CC vdouble xldexp(vdouble x, vint q) { return vldexp_vd_vd_vi(x, q); }

// Public ilogb with the C99 special cases: ILOGB0 for zero, ILOGBNAN for
// NaN, INT_MAX for infinities.
EXPORT CONST VECTOR_CC vint xilogb(vdouble d) {
  vdouble e = vcast_vd_vi(vilogbk_vi_vd(vabs_vd_vd(d)));
  e = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_FP_ILOGB0), e);
  e = vsel_vd_vo_vd_vd(visnan_vo_vd(d), vcast_vd_d(SLEEF_FP_ILOGBNAN), e);
  e = vsel_vd_vo_vd_vd(visinf_vo_vd(d), vcast_vd_d(INT_MAX), e);
  return vrint_vi_vd(e);
}

#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA))
// (vdouble, vint) pair plus trivial accessors — SVE targets supply their own
// sizeless-type-compatible definitions, hence the guard.
typedef struct {
  vdouble d;
  vint i;
} di_t;

static vdouble digetd_vd_di(di_t d) { return d.d; }
static vint digeti_vi_di(di_t d) { return d.i; }
static di_t disetdi_di_vd_vi(vdouble d, vint i) {
  di_t r = { d, i };
  return r;
}

// (vdouble2, vint) pair: double-double value plus quadrant integer.
typedef struct {
  vdouble2 dd;
  vint i;
} ddi_t;

static vdouble2 ddigetdd_vd2_ddi(ddi_t d) { return d.dd; }
static vint ddigeti_vi_ddi(ddi_t d) { return d.i; }
static ddi_t ddisetddi_ddi_vd2_vi(vdouble2 v, vint i) {
  ddi_t r = { v, i };
  return r;
}
static ddi_t ddisetdd_ddi_ddi_vd2(ddi_t ddi, vdouble2 v) {
  ddi.dd = v;
  return ddi;
}
#endif

// Bitwise OR of x with the sign bit of y.
static INLINE CONST VECTOR_CC vdouble vorsign_vd_vd_vd(vdouble x, vdouble y) {
  return vreinterpret_vd_vm(vor_vm_vm_vm(vreinterpret_vm_vd(x), vsignbit_vm_vd(y)));
}

// Reduces x to [-0.25, 0.25) returning the remainder and the integer part of
// 4*x (mod the lane's int range). The non-FULL_FP_ROUNDING path emulates
// round-to-nearest by adding/subtracting a sign-matched 2^52 constant.
static INLINE CONST di_t rempisub(vdouble x) {
#ifdef FULL_FP_ROUNDING
  vdouble y = vrint_vd_vd(vmul_vd_vd_vd(x, vcast_vd_d(4)));
  vint vi = vtruncate_vi_vd(vsub_vd_vd_vd(y, vmul_vd_vd_vd(vrint_vd_vd(x), vcast_vd_d(4))));
  return disetdi_di_vd_vi(vsub_vd_vd_vd(x, vmul_vd_vd_vd(y, vcast_vd_d(0.25))), vi);
#else
  vdouble c = vmulsign_vd_vd_vd(vcast_vd_d(INT64_C(1) << 52), x);
  vdouble rint4x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(vmul_vd_vd_vd(vcast_vd_d(4), x)), vcast_vd_d(INT64_C(1) << 52)),
				    vmul_vd_vd_vd(vcast_vd_d(4), x),
				    vorsign_vd_vd_vd(vsub_vd_vd_vd(vmla_vd_vd_vd_vd(vcast_vd_d(4), x, c), c), x));
  vdouble rintx  = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52)),
				    x, vorsign_vd_vd_vd(vsub_vd_vd_vd(vadd_vd_vd_vd(x, c), c), x));
  return disetdi_di_vd_vi(vmla_vd_vd_vd_vd(vcast_vd_d(-0.25), rint4x, x),
			  vtruncate_vi_vd(vmla_vd_vd_vd_vd(vcast_vd_d(-4), rintx, rint4x)));
#endif
}

// Table-driven large-argument trig reduction (uses the precomputed
// Sleef_rempitabdp table of 2/pi fragments, indexed by exponent). Returns the
// reduced argument times pi/2 as a double-double plus the quadrant count.
static INLINE CONST ddi_t rempi(vdouble a) {
  vdouble2 x, y, z;
  vint ex = vilogb2k_vi_vd(a);
#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)
  ex = vandnot_vi_vi_vi(vsra_vi_vi_i(ex, 31), ex);
  ex = vand_vi_vi_vi(ex, vcast_vi_i(1023));
#endif
  ex = vsub_vi_vi_vi(ex, vcast_vi_i(55));
  // Pre-scale huge arguments down by 2^64 so table products stay finite.
  vint q = vand_vi_vo_vi(vgt_vo_vi_vi(ex, vcast_vi_i(700-55)), vcast_vi_i(-64));
  a = vldexp3_vd_vd_vi(a, q);
  ex = vandnot_vi_vi_vi(vsra_vi_vi_i(ex, 31), ex);
  ex = vsll_vi_vi_i(ex, 2);
  // Three-stage multiply-accumulate against consecutive table entries,
  // renormalizing and folding whole quadrants out after each stage.
  x = ddmul_vd2_vd_vd(a, vgather_vd_p_vi(Sleef_rempitabdp, ex));
  di_t di = rempisub(vd2getx_vd_vd2(x));
  q = digeti_vi_di(di);
  x = vd2setx_vd2_vd2_vd(x, digetd_vd_di(di));
  x = ddnormalize_vd2_vd2(x);
  y = ddmul_vd2_vd_vd(a, vgather_vd_p_vi(Sleef_rempitabdp+1, ex));
  x = ddadd2_vd2_vd2_vd2(x, y);
  di = rempisub(vd2getx_vd_vd2(x));
  q = vadd_vi_vi_vi(q, digeti_vi_di(di));
  x = vd2setx_vd2_vd2_vd(x, digetd_vd_di(di));
  x = ddnormalize_vd2_vd2(x);
  y = vcast_vd2_vd_vd(vgather_vd_p_vi(Sleef_rempitabdp+2, ex), vgather_vd_p_vi(Sleef_rempitabdp+3, ex));
  y = ddmul_vd2_vd2_vd(y, a);
  x = ddadd2_vd2_vd2_vd2(x, y);
  x = ddnormalize_vd2_vd2(x);
  x = ddmul_vd2_vd2_vd2(x, vcast_vd2_d_d(3.141592653589793116*2, 1.2246467991473532072e-16*2));
  // Small inputs (|a| < 0.7) bypass the reduction entirely.
  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(a), vcast_vd_d(0.7));
  x = vd2setx_vd2_vd2_vd(x, vsel_vd_vo_vd_vd(o, a, vd2getx_vd_vd2(x)));
  x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vandnot_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(x)))));
  return ddisetddi_ddi_vd2_vi(x, q);
}

// Vector sin, 3.5-ULP accuracy class. Three-tier argument reduction:
// Cody-Waite with 2 constants (|d| < TRIGRANGEMAX2), Cody-Waite with 4 split
// constants (|d| < TRIGRANGEMAX), then full table reduction via rempi().
EXPORT CONST VECTOR_CC vdouble xsin(vdouble d) {
#if !defined(DETERMINISTIC)
  // The SIMD source files(sleefsimd?p.c) are compiled twice for each
  // vector extension, with DETERMINISTIC macro turned on and off.
  // Below is the normal(faster) implementation of sin function. The
  // function name xsin will be renamed to Sleef_sind2_u35sse2 with
  // renamesse2.h, for example.
  vdouble u, s, r = d;
  vint ql;

  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {
    vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)));
    ql = vrint_vi_vd(dql);
    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2), d);
    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2), d);
  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {
    // Split the quotient into a high part (multiple of 2^24) and a low part
    // so each PI_A..PI_D product is exact.
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    vdouble dql = vrint_vd_vd(vmlapn_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), dqh));
    ql = vrint_vi_vd(dql);

    d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A), d);
    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A), d);
    d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B), d);
    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B), d);
    d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C), d);
    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C), d);
    d = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D), d);
  } else {
    // rempi returns quadrants of pi/2; convert to quadrants of pi and fold
    // odd pi/2-quadrants back by subtracting pi/2 with the matching sign.
    ddi_t ddi = rempi(d);
    ql = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));
    ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(2), vcast_vi_i(1)));
    ql = vsra_vi_vi_i(ql, 2);
    vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(1));
    vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))),
				 vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))));
    x = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x);
    ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, ddigetdd_vd2_ddi(ddi)));
    d = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));
    // inf/NaN inputs propagate to NaN.
    d = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(r), visnan_vo_vd(r)), vreinterpret_vm_vd(d)));
  }

  s = vmul_vd_vd_vd(d, d);

  // Odd quadrants negate the reduced argument (sin(x + pi) = -sin(x)).
  d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d)));

  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY8(s, s2, s4,
	    -7.97255955009037868891952e-18,
	    2.81009972710863200091251e-15,
	    -7.64712219118158833288484e-13,
	    1.60590430605664501629054e-10,
	    -2.50521083763502045810755e-08,
	    2.75573192239198747630416e-06,
	    -0.000198412698412696162806809,
	    0.00833333333333332974823815);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808));

  u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d);

  u = vsel_vd_vo_vd_vd(visnegzero_vo_vd(r), r, u); // sin(-0.0) = -0.0

  return u;
#else // #if !defined(DETERMINISTIC)

  // This is the deterministic implementation of sin function. Returned
  // values from deterministic functions are bitwise consistent across
  // all platforms. The function name xsin will be renamed to
  // Sleef_cinz_sind2_u35sse2 with renamesse2.h, for example. The
  // renaming by rename*.h is switched according to DETERMINISTIC macro.
  vdouble u, s, r = d;
  vint ql;

  // Always run the cheap reduction, then patch lanes that needed more via
  // per-lane selects — branch outcomes never affect lane values.
  vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)));
  ql = vrint_vi_vd(dql);
  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2), d);
  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2), d);
  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(r), vcast_vd_d(TRIGRANGEMAX2));

  if (!LIKELY(vtestallones_i_vo64(g))) {
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(r, vcast_vd_d(M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    vdouble dql = vrint_vd_vd(vmlapn_vd_vd_vd_vd(r, vcast_vd_d(M_1_PI), dqh));

    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A), r);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A), u);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B), u);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B), u);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C), u);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C), u);
    u = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D), u);

    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql));
    d = vsel_vd_vo_vd_vd(g, d, u);
    g = vlt_vo_vd_vd(vabs_vd_vd(r), vcast_vd_d(TRIGRANGEMAX));

    if (!LIKELY(vtestallones_i_vo64(g))) {
      ddi_t ddi = rempi(r);
      vint ql2 = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));
      ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(2), vcast_vi_i(1)));
      ql2 = vsra_vi_vi_i(ql2, 2);
      vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(1));
      vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))),
				   vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))));
      x = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x);
      ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, ddigetdd_vd2_ddi(ddi)));
      u = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));
      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2);
      d = vsel_vd_vo_vd_vd(g, d, u);
      d = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(r), visnan_vo_vd(r)), vreinterpret_vm_vd(d)));
    }
  }

  s = vmul_vd_vd_vd(d, d);

  d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d)));

  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY8(s, s2, s4,
	    -7.97255955009037868891952e-18,
	    2.81009972710863200091251e-15,
	    -7.64712219118158833288484e-13,
	    1.60590430605664501629054e-10,
	    -2.50521083763502045810755e-08,
	    2.75573192239198747630416e-06,
	    -0.000198412698412696162806809,
	    0.00833333333333332974823815);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808));

  u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d);

  u = vsel_vd_vo_vd_vd(visnegzero_vo_vd(r), r, u);

  return u;
#endif // #if !defined(DETERMINISTIC)
}

// Vector sin, 1-ULP accuracy class: same three-tier reduction as xsin but
// the reduced argument and the final polynomial are carried in
// double-double (vdouble2) precision.
EXPORT CONST VECTOR_CC vdouble xsin_u1(vdouble d) {
#if !defined(DETERMINISTIC)
  vdouble u;
  vdouble2 s, t, x;
  vint ql;

  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {
    const vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)));
    ql = vrint_vi_vd(dql);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2), d);
    s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2)));
  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    const vdouble dql = vrint_vd_vd(vmlapn_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), dqh));
    ql = vrint_vi_vd(dql);

    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A), d);
    s = ddadd_vd2_vd_vd  (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C)));
    s = ddadd_vd2_vd2_vd (s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D)));
  } else {
    ddi_t ddi = rempi(d);
    ql = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));
    ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(2), vcast_vi_i(1)));
    ql = vsra_vi_vi_i(ql, 2);
    vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(1));
    vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))),
				 vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))));
    x = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x);
    ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, ddigetdd_vd2_ddi(ddi)));
    s = ddnormalize_vd2_vd2(ddigetdd_vd2_ddi(ddi));
    s = vd2setx_vd2_vd2_vd(s, vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(vd2getx_vd_vd2(s)))));
  }

  t = s;
  s = ddsqu_vd2_vd2(s);

  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY6(vd2getx_vd_vd2(s), s2, s4,
	    2.72052416138529567917983e-15,
	    -7.6429259411395447190023e-13,
	    1.60589370117277896211623e-10,
	    -2.5052106814843123359368e-08,
	    2.75573192104428224777379e-06,
	    -0.000198412698412046454654947);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.00833333333333318056201922));

  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, vd2getx_vd_vd2(s))), s));
  u = ddmul_vd_vd2_vd2(t, x);

  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(u)));
  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u); // preserve +-0.0

  return u;
#else // #if !defined(DETERMINISTIC)

  // Deterministic variant: cheap reduction always runs, wider reductions
  // patched in per-lane (see xsin above for the pattern).
  vdouble u;
  vdouble2 s, t, x;
  vint ql;

  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2));
  vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)));
  ql = vrint_vi_vd(dql);
  u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2), d);
  x = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2)));

  if (!LIKELY(vtestallones_i_vo64(g))) {
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    const vdouble dql = vrint_vd_vd(vmlapn_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), dqh));

    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A), d);
    s = ddadd_vd2_vd_vd  (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C)));
    s = ddadd_vd2_vd2_vd (s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D)));

    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql));
    x = vsel_vd2_vo_vd2_vd2(g, x, s);
    g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX));

    if (!LIKELY(vtestallones_i_vo64(g))) {
      ddi_t ddi = rempi(d);
      vint ql2 = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));
      ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(2), vcast_vi_i(1)));
      ql2 = vsra_vi_vi_i(ql2, 2);
      vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(1));
      vdouble2 t = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))),
				   vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))));
      t = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), t);
      ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), t, ddigetdd_vd2_ddi(ddi)));
      s = ddnormalize_vd2_vd2(ddigetdd_vd2_ddi(ddi));
      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2);
      x = vsel_vd2_vo_vd2_vd2(g, x, s);
      x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));
    }
  }

  t = x;
  s = ddsqu_vd2_vd2(x);

  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY6(vd2getx_vd_vd2(s), s2, s4,
	    2.72052416138529567917983e-15,
	    -7.6429259411395447190023e-13,
	    1.60589370117277896211623e-10,
	    -2.5052106814843123359368e-08,
	    2.75573192104428224777379e-06,
	    -0.000198412698412046454654947);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.00833333333333318056201922));

  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, vd2getx_vd_vd2(s))), s));
  u = ddmul_vd_vd2_vd2(t, x);

  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(u)));
  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u);

  return u;
#endif // #if !defined(DETERMINISTIC)
}

// NOTE(review): xcos continues past the end of this chunk — head left verbatim.
EXPORT CONST VECTOR_CC vdouble xcos(vdouble d) {
#if !defined(DETERMINISTIC)
  vdouble u, s, r = d;
  vint ql;

  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {
    vdouble dql = vmla_vd_vd_vd_vd(vcast_vd_d(2), vrint_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5))), vcast_vd_d(1));
    ql = vrint_vi_vd(dql);
    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2 * 0.5), d);
    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), d);
  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {
    vdouble dqh = vtruncate_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-M_1_PI / (1 << 24))));
    ql = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)),
vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5)))); dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24)); ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vcast_vi_i(1)); vdouble dql = vcast_vd_vi(ql); d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d); d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), d); d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), d); d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), d); d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), d); d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), d); d = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), d); } else { ddi_t ddi = rempi(d); ql = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3)); ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(8), vcast_vi_i(7))); ql = vsra_vi_vi_i(ql, 1); vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(0)); vdouble y = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0)), vcast_vd_d(0), vcast_vd_d(-1)); vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), y), vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), y)); x = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x); ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, ddigetdd_vd2_ddi(ddi))); d = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi))); d = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(r), visnan_vo_vd(r)), vreinterpret_vm_vd(d))); } s = vmul_vd_vd_vd(d, d); d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d))); vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2); u = POLY8(s, s2, s4, -7.97255955009037868891952e-18, 
2.81009972710863200091251e-15, -7.64712219118158833288484e-13, 1.60590430605664501629054e-10, -2.50521083763502045810755e-08, 2.75573192239198747630416e-06, -0.000198412698412696162806809, 0.00833333333333332974823815); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808)); u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d); return u; #else // #if !defined(DETERMINISTIC) vdouble u, s, r = d; vint ql; vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2)); vdouble dql = vmla_vd_vd_vd_vd(vcast_vd_d(2), vrint_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5))), vcast_vd_d(1)); ql = vrint_vi_vd(dql); d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2 * 0.5), d); d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), d); if (!LIKELY(vtestallones_i_vo64(g))) { vdouble dqh = vtruncate_vd_vd(vmla_vd_vd_vd_vd(r, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-M_1_PI / (1 << 24)))); vint ql2 = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(r, vcast_vd_d(M_1_PI)), vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5)))); dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24)); ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), vcast_vi_i(1)); vdouble dql = vcast_vd_vi(ql2); u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), r); u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), u); u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), u); u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), u); u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), u); u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), u); u = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), u); ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2); d = vsel_vd_vo_vd_vd(g, d, u); g = vlt_vo_vd_vd(vabs_vd_vd(r), vcast_vd_d(TRIGRANGEMAX)); if (!LIKELY(vtestallones_i_vo64(g))) { ddi_t ddi = rempi(r); vint ql2 = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3)); ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), 
vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(8), vcast_vi_i(7))); ql2 = vsra_vi_vi_i(ql2, 1); vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(0)); vdouble y = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0)), vcast_vd_d(0), vcast_vd_d(-1)); vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), y), vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), y)); x = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x); ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, ddigetdd_vd2_ddi(ddi))); u = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi))); ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2); d = vsel_vd_vo_vd_vd(g, d, u); d = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(r), visnan_vo_vd(r)), vreinterpret_vm_vd(d))); } } s = vmul_vd_vd_vd(d, d); d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d))); vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2); u = POLY8(s, s2, s4, -7.97255955009037868891952e-18, 2.81009972710863200091251e-15, -7.64712219118158833288484e-13, 1.60590430605664501629054e-10, -2.50521083763502045810755e-08, 2.75573192239198747630416e-06, -0.000198412698412696162806809, 0.00833333333333332974823815); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808)); u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d); return u; #endif // #if !defined(DETERMINISTIC) } EXPORT CONST VECTOR_CC vdouble xcos_u1(vdouble d) { #if !defined(DETERMINISTIC) vdouble u; vdouble2 s, t, x; vint ql; if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) { vdouble dql = vrint_vd_vd(vmla_vd_vd_vd_vd(d, 
vcast_vd_d(M_1_PI), vcast_vd_d(-0.5)));
  // ---- xcos_u1 fast path continues: make dql the nearest odd multiple of
  // pi/2 and reduce with the two-part split, accumulating in double-double.
  dql = vmla_vd_vd_vd_vd(vcast_vd_d(2), dql, vcast_vd_d(1));
  ql = vrint_vi_vd(dql);
  s = ddadd2_vd2_vd_vd(d, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5)));
  s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5)));
  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {
    // Medium path: split quotient (dqh/dql) and four-part pi/2 reduction in
    // double-double.
    vdouble dqh = vtruncate_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-M_1_PI / (1 << 24))));
    ql = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)), vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vcast_vi_i(1));
    const vdouble dql = vcast_vd_vi(ql);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    s = ddadd2_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5)));
    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5)));
  } else {
    // Huge-argument fallback via rempi (full-precision range reduction).
    ddi_t ddi = rempi(d);
    ql = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));
    ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(8), vcast_vi_i(7)));
    ql = vsra_vi_vi_i(ql, 1);
    // For even quadrant bits, shift the remainder by +-pi/2 (head + tail).
    vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(0));
    vdouble y = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0)), vcast_vd_d(0), vcast_vd_d(-1));
    vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), y), vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), y));
    x = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x);
    ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, ddigetdd_vd2_ddi(ddi)));
    s = ddnormalize_vd2_vd2(ddigetdd_vd2_ddi(ddi));
    // Inf/NaN inputs -> NaN result.
    s = vd2setx_vd2_vd2_vd(s, vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(vd2getx_vd_vd2(s)))));
  }
  // Double-double sine-polynomial core on the reduced argument.
  t = s;
  s = ddsqu_vd2_vd2(s);
  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY6(vd2getx_vd_vd2(s), s2, s4,
    2.72052416138529567917983e-15, -7.6429259411395447190023e-13,
    1.60589370117277896211623e-10, -2.5052106814843123359368e-08,
    2.75573192104428224777379e-06, -0.000198412698412046454654947);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.00833333333333318056201922));
  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, vd2getx_vd_vd2(s))), s));
  u = ddmul_vd_vd2_vd2(t, x);
  // Flip the sign when (ql & 2) == 0.
  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(u)));
  return u;
#else // #if !defined(DETERMINISTIC)
  // Deterministic variant of xcos_u1: unconditional fast-path reduction,
  // wider reductions merged per-lane via selects.
  vdouble u;
  vdouble2 s, t, x;
  vint ql;
  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2));
  vdouble dql = vrint_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5)));
  dql = vmla_vd_vd_vd_vd(vcast_vd_d(2), dql, vcast_vd_d(1));
  ql = vrint_vi_vd(dql);
  x = ddadd2_vd2_vd_vd(d, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5)));
  x = ddadd_vd2_vd2_vd(x, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5)));
  if (!LIKELY(vtestallones_i_vo64(g))) {
    // Medium-range reduction for out-of-range lanes.
    vdouble dqh = vtruncate_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-M_1_PI / (1 << 24))));
    vint ql2 = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)), vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), vcast_vi_i(1));
    const vdouble dql = vcast_vd_vi(ql2);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    s = ddadd2_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5)));
    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5)));
    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2);
    x = vsel_vd2_vo_vd2_vd2(g, x, s);
    g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX));
    if (!LIKELY(vtestallones_i_vo64(g))) {
      // Huge-argument fallback via rempi, merged by mask g.
      ddi_t ddi = rempi(d);
      vint ql2 = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));
      ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(8), vcast_vi_i(7)));
      ql2 = vsra_vi_vi_i(ql2, 1);
      vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(0));
      vdouble y = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0)), vcast_vd_d(0), vcast_vd_d(-1));
      vdouble2 t = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), y), vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), y));
      t = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), t);
      ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), t, ddigetdd_vd2_ddi(ddi)));
      s = ddnormalize_vd2_vd2(ddigetdd_vd2_ddi(ddi));
      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2);
      x = vsel_vd2_vo_vd2_vd2(g, x, s);
      x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));
    }
  }
  // Same polynomial core and sign flip as the non-deterministic branch.
  t = x;
  s = ddsqu_vd2_vd2(x);
  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY6(vd2getx_vd_vd2(s), s2, s4,
    2.72052416138529567917983e-15, -7.6429259411395447190023e-13,
    1.60589370117277896211623e-10, -2.5052106814843123359368e-08,
    2.75573192104428224777379e-06, -0.000198412698412046454654947);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.00833333333333318056201922));
  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, vd2getx_vd_vd2(s))), s));
  u = ddmul_vd_vd2_vd2(t, x);
  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(u)));
  return u;
#endif // #if !defined(DETERMINISTIC)
}

// Under ENABLE_GNUABI the sincos/modf kernels are emitted as internal
// helpers (sincosk, modfk, ...) that the GNU-ABI entry points below wrap;
// otherwise they are exported directly under their x-prefixed names.
#ifdef ENABLE_GNUABI
#define TYPE2_FUNCATR static INLINE CONST
#define TYPE6_FUNCATR static INLINE CONST
#define SQRTU05_FUNCATR static INLINE CONST
#define XSINCOS sincosk
#define XSINCOS_U1 sincosk_u1
#define XSINCOSPI_U05 sincospik_u05
#define XSINCOSPI_U35 sincospik_u35
#define XMODF modfk
#else
#define TYPE2_FUNCATR EXPORT
#define TYPE6_FUNCATR EXPORT CONST
#define SQRTU05_FUNCATR EXPORT CONST
#define XSINCOS xsincos
#define XSINCOS_U1 xsincos_u1
#define XSINCOSPI_U05 xsincospi_u05
#define XSINCOSPI_U35 xsincospi_u35
#define XMODF xmodf
#endif

// Simultaneous sin(d) and cos(d) (faster variant).  Returns a vdouble2 whose
// x component holds the sine and y the cosine; the quadrant index ql swaps
// and sign-corrects the two polynomial results.
TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOS(vdouble d) {
#if !defined(DETERMINISTIC)
  vopmask o;
  vdouble u, t, rx, ry, s;
  vdouble2 r;
  vint ql;
  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {
    // Fast path: reduce modulo pi/2.
    vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
    ql = vrint_vi_vd(dql);
    s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2 * 0.5), d);
    s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), s);
  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {
    // Medium path: split quotient, four-part pi/2 reduction.
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh));
    ql = vrint_vi_vd(dql);
    s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), s);
    s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), s);
    s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), s);
    s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), s);
    s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), s);
    s = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), s);
  } else {
    // Huge-argument fallback via rempi; collapse the double-double remainder
    // to a plain double and force NaN on Inf/NaN inputs.
    ddi_t ddi = rempi(d);
    ql = ddigeti_vi_ddi(ddi);
    s = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));
    s = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(s)));
  }
  // Sine polynomial (odd, in s = t^2) -> rx; keep -0.0 for d == -0.0.
  t = s;
  s = vmul_vd_vd_vd(s, s);
  u = vcast_vd_d(1.58938307283228937328511e-10);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.50506943502539773349318e-08));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573131776846360512547e-06));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000198412698278911770864914));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0083333333333191845961746));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666130709393));
  rx = vmla_vd_vd_vd_vd(vmul_vd_vd_vd(u, s), t, t);
  rx = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), rx);
  // Cosine polynomial (even) -> ry.
  u = vcast_vd_d(-1.13615350239097429531523e-11);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.08757471207040055479366e-09));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.75573144028847567498567e-07));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.48015872890001867311915e-05));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.00138888888888714019282329));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0416666666666665519592062));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.5));
  ry = vmla_vd_vd_vd_vd(s, u, vcast_vd_d(1));
  // Place/sign the two results by quadrant: swap when ql is odd, negate sin
  // when (ql & 2) == 2, negate cos when ((ql+1) & 2) == 2.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(0)));
  r = vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(o, rx, ry), vsel_vd_vo_vd_vd(o, ry, rx));
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(2)));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));
  return r;
#else // #if !defined(DETERMINISTIC)
  // Deterministic variant: fast-path reduction always computed, wider
  // reductions merged per-lane via selects.
  vopmask o;
  vdouble u, t, rx, ry, s = d;
  vdouble2 r;
  vint ql;
  vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(s, vcast_vd_d(2 * M_1_PI)));
  ql = vrint_vi_vd(dql);
  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2 * 0.5), s);
  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), s);
  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2));
  if (!LIKELY(vtestallones_i_vo64(g))) {
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh));
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), u);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), u);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), u);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), u);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), u);
    u = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), u);
    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql));
    s = vsel_vd_vo_vd_vd(g, s, u);
    g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX));
    if (!LIKELY(vtestallones_i_vo64(g))) {
      ddi_t ddi = rempi(d);
      u = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));
      u = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(u)));
      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ddigeti_vi_ddi(ddi));
      s = vsel_vd_vo_vd_vd(g, s, u);
    }
  }
  // Same polynomial cores and quadrant placement as above.
  t = s;
  s =
const vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI))); ql = vrint_vi_vd(dql); u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5), d); s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5))); } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) { vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24)))); dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24)); const vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh)); ql = vrint_vi_vd(dql); u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d); s = ddadd_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5))); s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5))); } else { ddi_t ddi = rempi(d); ql = ddigeti_vi_ddi(ddi); s = ddigetdd_vd2_ddi(ddi); o = vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)); s = vd2setxy_vd2_vd_vd(vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(s)))), vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(s))))); } t = s; s = vd2setx_vd2_vd2_vd(s, ddsqu_vd_vd2(s)); u = vcast_vd_d(1.58938307283228937328511e-10); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-2.50506943502539773349318e-08)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.75573131776846360512547e-06)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.000198412698278911770864914)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.0083333333333191845961746)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.166666666666666130709393)); u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(t))); x = ddadd_vd2_vd2_vd(t, 
u); rx = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)); rx = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), rx); u = vcast_vd_d(-1.13615350239097429531523e-11); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.08757471207040055479366e-09)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-2.75573144028847567498567e-07)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.48015872890001867311915e-05)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.00138888888888714019282329)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.0416666666666665519592062)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.5)); x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(vd2getx_vd_vd2(s), u)); ry = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)); o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(0))); r = vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(o, rx, ry), vsel_vd_vo_vd_vd(o, ry, rx)); o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(2))); r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(r))))); o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2))); r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(r))))); return r; #else // #if !defined(DETERMINISTIC) vopmask o; vdouble u, rx, ry; vdouble2 r, s, t, x; vint ql; const vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI))); ql = vrint_vi_vd(dql); u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5), d); s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5))); vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2)); if (!LIKELY(vtestallones_i_vo64(g))) { vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 
24)))); dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24)); const vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh)); u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d); x = ddadd_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5))); x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5))); x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5))); x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5))); x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5))); x = ddadd_vd2_vd2_vd(x, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5))); ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql)); s = vsel_vd2_vo_vd2_vd2(g, s, x); g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX)); if (!LIKELY(vtestallones_i_vo64(g))) { ddi_t ddi = rempi(d); x = ddigetdd_vd2_ddi(ddi); o = vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)); x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(x))))); x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(x))))); ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ddigeti_vi_ddi(ddi)); s = vsel_vd2_vo_vd2_vd2(g, s, x); } } t = s; s = vd2setx_vd2_vd2_vd(s, ddsqu_vd_vd2(s)); u = vcast_vd_d(1.58938307283228937328511e-10); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-2.50506943502539773349318e-08)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.75573131776846360512547e-06)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.000198412698278911770864914)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.0083333333333191845961746)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.166666666666666130709393)); u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(t))); x = ddadd_vd2_vd2_vd(t, u); rx = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)); rx = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), 
vcast_vd_d(-0.0), rx);

  // (Continuation of the preceding sincos kernel.)
  // Cosine polynomial, Horner form with fused multiply-add style steps.
  u = vcast_vd_d(-1.13615350239097429531523e-11);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.08757471207040055479366e-09));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-2.75573144028847567498567e-07));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.48015872890001867311915e-05));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.00138888888888714019282329));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.0416666666666665519592062));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.5));

  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(vd2getx_vd_vd2(s), u));
  ry = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  // Bit 0 of ql decides which of (rx, ry) is the sine and which the cosine.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(0)));
  r = vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(o, rx, ry), vsel_vd_vo_vd_vd(o, ry, rx));

  // Negate the sine result when bit 1 of ql is set (XOR of the sign bit
  // in the mask domain).
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(2)));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));

  // Negate the cosine result when bit 1 of (ql + 1) is set.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  return r;
#endif // #if !defined(DETERMINISTIC)
}

#if !defined(DETERMINISTIC)
// Simultaneous sin(pi*d) / cos(pi*d); x lane of the result is the sine,
// y lane the cosine.  The _U05 suffix is SLEEF's higher-accuracy tier
// (about 0.5 ULP per the SLEEF docs -- confirm against upstream).
TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOSPI_U05(vdouble d) {
  vopmask o;
  vdouble u, s, t, rx, ry;
  vdouble2 r, x, s2;

  // Reduce to quarter turns: q = 4*d rounded to an even integer,
  // s = 4*d - q is the residual in [-1, 1].
  u = vmul_vd_vd_vd(d, vcast_vd_d(4.0));
  vint q = vtruncate_vi_vd(u);
  q = vand_vi_vi_vi(vadd_vi_vi_vi(q, vxor_vi_vi_vi(vsrl_vi_vi_i(q, 31), vcast_vi_i(1))), vcast_vi_i(~1));
  s = vsub_vd_vd_vd(u, vcast_vd_vi(q));

  t = s;
  s = vmul_vd_vd_vd(s, s);
  s2 = ddmul_vd2_vd_vd(t, t); // double-double t*t for the high-accuracy tail

  // Sine polynomial: low-order terms in plain double precision.
  u = vcast_vd_d(-2.02461120785182399295868e-14);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(6.94821830580179461327784e-12));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-1.75724749952853179952664e-09));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(3.13361688966868392878422e-07));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-3.6576204182161551920361e-05));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00249039457019271850274356));
  // Last coefficients carried in double-double to preserve accuracy.
  x = ddadd2_vd2_vd_vd2(vmul_vd_vd_vd(u, s), vcast_vd2_d_d(-0.0807455121882807852484731, 3.61852475067037104849987e-18));
  x = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(s2, x), vcast_vd2_d_d(0.785398163397448278999491, 3.06287113727155002607105e-17));

  x = ddmul_vd2_vd2_vd(x, t);
  rx = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  rx = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), rx); // sinpi(-0) = -0

  // Cosine polynomial, same scheme.
  u = vcast_vd_d(9.94480387626843774090208e-16);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-3.89796226062932799164047e-13));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1.15011582539996035266901e-10));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.4611369501044697495359e-08));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(3.59086044859052754005062e-06));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000325991886927389905997954));
  x = ddadd2_vd2_vd_vd2(vmul_vd_vd_vd(u, s), vcast_vd2_d_d(0.0158543442438155018914259, -1.04693272280631521908845e-18));
  x = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(s2, x), vcast_vd2_d_d(-0.308425137534042437259529, -1.95698492133633550338345e-17));
  x = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd2(x, s2), vcast_vd_d(1));

  ry = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  // Quadrant fix-up: swap / negate sin and cos according to q.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0)));
  r = vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(o, rx, ry), vsel_vd_vo_vd_vd(o, ry, rx));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(4)), vcast_vi_i(4)));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(4)), vcast_vi_i(4)));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  // Out of range (|d| > TRIGRANGEMAX3/4): force sin -> 0, cos -> 1.
  o = vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX3/4));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vandnot_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));
  r = vd2sety_vd2_vd2_vd(r, vsel_vd_vo_vd_vd(o, vcast_vd_d(1), vd2gety_vd_vd2(r)));

  // Infinite input: OR-in an all-ones mask, i.e. produce NaN in both lanes.
  o = visinf_vo_vd(d);
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  return r;
}

// Faster, lower-accuracy variant of the above (_U35 tier, plain double
// polynomials instead of double-double tails).
TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOSPI_U35(vdouble d) {
  vopmask o;
  vdouble u, s, t, rx, ry;
  vdouble2 r;

  // Same quarter-turn reduction as the _U05 variant.
  u = vmul_vd_vd_vd(d, vcast_vd_d(4.0));
  vint q = vtruncate_vi_vd(u);
  q = vand_vi_vi_vi(vadd_vi_vi_vi(q, vxor_vi_vi_vi(vsrl_vi_vi_i(q, 31), vcast_vi_i(1))), vcast_vi_i(~1));
  s = vsub_vd_vd_vd(u, vcast_vd_vi(q));

  t = s;
  s = vmul_vd_vd_vd(s, s);

  // Sine polynomial (plain double throughout).
  u = vcast_vd_d(+0.6880638894766060136e-11);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.1757159564542310199e-8));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.3133616327257867311e-6));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.3657620416388486452e-4));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2490394570189932103e-2));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.8074551218828056320e-1));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.7853981633974482790e+0));

  rx = vmul_vd_vd_vd(u, t);

  // Cosine polynomial.
  u = vcast_vd_d(-0.3860141213683794352e-12);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1150057888029681415e-9));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.2461136493006663553e-7));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.3590860446623516713e-5));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.3259918869269435942e-3));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1585434424381541169e-1));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.3084251375340424373e+0));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1));

  ry = u;

  // Quadrant fix-up (continues in the following statements).
  o =
vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0)));
  r = vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(o, rx, ry), vsel_vd_vo_vd_vd(o, ry, rx));

  // Sign fix-up per quadrant, same XOR-the-sign-bit trick as the _U05 variant.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(4)), vcast_vi_i(4)));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(4)), vcast_vi_i(4)));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  // Out of range: zero both lanes (note: unlike _U05, cos is zeroed, not 1).
  o = vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX3/4));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vandnot_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vandnot_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  // Infinite input -> NaN in both lanes.
  o = visinf_vo_vd(d);
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  return r;
}

// modf-style split: returns {fractional part, integral part}, each carrying
// the sign of x.  The two-stage truncation handles magnitudes beyond the
// 32-bit integer conversion range; |x| > 2^52 has no fractional part.
TYPE6_FUNCATR VECTOR_CC vdouble2 XMODF(vdouble x) {
  // Remove the multiple of 2^31 first so the 32-bit truncation below is exact.
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  // Beyond 2^52 every double is an integer: force the fraction to zero.
  fr = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52)), vcast_vd_d(0), fr);

  vdouble2 ret;
  ret = vd2setxy_vd2_vd_vd(vcopysign_vd_vd_vd(fr, x), vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));

  return ret;
}

#ifdef ENABLE_GNUABI
// GNU-ABI entry points: store the paired results through the given pointers.
EXPORT VECTOR_CC void xsincos(vdouble a, double *ps, double *pc) {
  vdouble2 r = sincosk(a);
  vstoreu_v_p_vd(ps, vd2getx_vd_vd2(r));
  vstoreu_v_p_vd(pc, vd2gety_vd_vd2(r));
}

EXPORT VECTOR_CC void xsincos_u1(vdouble a, double *ps, double *pc) {
  vdouble2 r = sincosk_u1(a);
  vstoreu_v_p_vd(ps, vd2getx_vd_vd2(r));
  vstoreu_v_p_vd(pc, vd2gety_vd_vd2(r));
}

EXPORT VECTOR_CC void xsincospi_u05(vdouble a, double *ps, double *pc) {
  vdouble2 r = sincospik_u05(a);
  vstoreu_v_p_vd(ps, vd2getx_vd_vd2(r));
  vstoreu_v_p_vd(pc, vd2gety_vd_vd2(r));
}

EXPORT VECTOR_CC void xsincospi_u35(vdouble a, double *ps, double *pc) {
  vdouble2 r = sincospik_u35(a);
  vstoreu_v_p_vd(ps, vd2getx_vd_vd2(r));
  vstoreu_v_p_vd(pc, vd2gety_vd_vd2(r));
}

// modf: integral part goes to *iptr, fractional part is returned.
EXPORT CONST VECTOR_CC vdouble xmodf(vdouble a, double *iptr) {
  vdouble2 r = modfk(a);
  vstoreu_v_p_vd(iptr, vd2gety_vd_vd2(r));
  return vd2getx_vd_vd2(r);
}
#endif // #ifdef ENABLE_GNUABI
#endif // #if !defined(DETERMINISTIC)

// Double-double sin(pi*d) kernel shared by xsinpi_u05.  The mask o selects,
// per lane, between the cosine-series and sine-series coefficients so one
// polynomial evaluation serves both halves of the quadrant reduction.
static INLINE CONST VECTOR_CC vdouble2 sinpik(vdouble d) {
  vopmask o;
  vdouble u, s, t;
  vdouble2 x, s2;

  // Quarter-turn reduction: q = 4*d rounded to an even integer.
  u = vmul_vd_vd_vd(d, vcast_vd_d(4.0));
  vint q = vtruncate_vi_vd(u);
  q = vand_vi_vi_vi(vadd_vi_vi_vi(q, vxor_vi_vi_vi(vsrl_vi_vi_i(q, 31), vcast_vi_i(1))), vcast_vi_i(~1));
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2)));

  s = vsub_vd_vd_vd(u, vcast_vd_vi(q));
  t = s;
  s = vmul_vd_vd_vd(s, s);
  s2 = ddmul_vd2_vd_vd(t, t);

  // Per-lane coefficient selection: first value when o (cosine branch),
  // second otherwise (sine branch).
  u = vsel_vd_vo_d_d(o, 9.94480387626843774090208e-16, -2.02461120785182399295868e-14);
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -3.89796226062932799164047e-13, 6.948218305801794613277840e-12));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, 1.150115825399960352669010e-10, -1.75724749952853179952664e-09));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -2.46113695010446974953590e-08, 3.133616889668683928784220e-07));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, 3.590860448590527540050620e-06, -3.65762041821615519203610e-05));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -0.000325991886927389905997954, 0.0024903945701927185027435600));
  x = ddadd2_vd2_vd_vd2(vmul_vd_vd_vd(u, s), vsel_vd2_vo_d_d_d_d(o, 0.0158543442438155018914259, -1.04693272280631521908845e-18, -0.0807455121882807852484731, 3.61852475067037104849987e-18));
  x = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(s2, x), vsel_vd2_vo_d_d_d_d(o, -0.308425137534042437259529, -1.95698492133633550338345e-17, 0.785398163397448278999491, 3.06287113727155002607105e-17));

  // Cosine branch multiplies by t*t and adds 1; sine branch multiplies by t.
  x = ddmul_vd2_vd2_vd2(x, vsel_vd2_vo_vd2_vd2(o, s2, vcast_vd2_vd_vd(t, vcast_vd_d(0))));
  x = vsel_vd2_vo_vd2_vd2(o, ddadd2_vd2_vd2_vd(x, vcast_vd_d(1)), x);

  // Negate when bit 2 of q is set.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(4)), vcast_vi_i(4)));
  x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));
  x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(x)))));

  return x;
}

// sin(pi*d), _U05 accuracy tier.  Handles -0, the out-of-range clamp and
// infinity explicitly after the double-double kernel.
EXPORT CONST VECTOR_CC vdouble xsinpi_u05(vdouble d) {
  vdouble2 x = sinpik(d);
  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  r = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), r);
  r = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX3/4)), vreinterpret_vm_vd(r)));
  r = vreinterpret_vd_vm(vor_vm_vo64_vm(visinf_vo_vd(d), vreinterpret_vm_vd(r)));

  return r;
}

// Double-double cos(pi*d) kernel; identical structure to sinpik but with the
// branch-selection mask inverted (compares against 0 instead of 2).
static INLINE CONST VECTOR_CC vdouble2 cospik(vdouble d) {
  vopmask o;
  vdouble u, s, t;
  vdouble2 x, s2;

  u = vmul_vd_vd_vd(d, vcast_vd_d(4.0));
  vint q = vtruncate_vi_vd(u);
  q = vand_vi_vi_vi(vadd_vi_vi_vi(q, vxor_vi_vi_vi(vsrl_vi_vi_i(q, 31), vcast_vi_i(1))), vcast_vi_i(~1));
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0)));

  s = vsub_vd_vd_vd(u, vcast_vd_vi(q));
  t = s;
  s = vmul_vd_vd_vd(s, s);
  s2 = ddmul_vd2_vd_vd(t, t);

  // Same coefficient tables as sinpik, selected per lane by o.
  u = vsel_vd_vo_d_d(o, 9.94480387626843774090208e-16, -2.02461120785182399295868e-14);
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -3.89796226062932799164047e-13, 6.948218305801794613277840e-12));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, 1.150115825399960352669010e-10, -1.75724749952853179952664e-09));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o,
-2.46113695010446974953590e-08, 3.133616889668683928784220e-07));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, 3.590860448590527540050620e-06, -3.65762041821615519203610e-05));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -0.000325991886927389905997954, 0.0024903945701927185027435600));
  x = ddadd2_vd2_vd_vd2(vmul_vd_vd_vd(u, s), vsel_vd2_vo_d_d_d_d(o, 0.0158543442438155018914259, -1.04693272280631521908845e-18, -0.0807455121882807852484731, 3.61852475067037104849987e-18));
  x = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(s2, x), vsel_vd2_vo_d_d_d_d(o, -0.308425137534042437259529, -1.95698492133633550338345e-17, 0.785398163397448278999491, 3.06287113727155002607105e-17));

  // Cosine branch multiplies by t*t and adds 1; sine branch multiplies by t.
  x = ddmul_vd2_vd2_vd2(x, vsel_vd2_vo_vd2_vd2(o, s2, vcast_vd2_vd_vd(t, vcast_vd_d(0))));
  x = vsel_vd2_vo_vd2_vd2(o, ddadd2_vd2_vd2_vd(x, vcast_vd_d(1)), x);

  // Negate when bit 2 of (q + 2) is set.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(4)), vcast_vi_i(4)));
  x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));
  x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(x)))));

  return x;
}

// cos(pi*d), _U05 accuracy tier.  Out-of-range inputs clamp to 1,
// infinity yields NaN.
EXPORT CONST VECTOR_CC vdouble xcospi_u05(vdouble d) {
  vdouble2 x = cospik(d);
  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  r = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX3/4)), vcast_vd_d(1), r);
  r = vreinterpret_vd_vm(vor_vm_vo64_vm(visinf_vo_vd(d), vreinterpret_vm_vd(r)));

  return r;
}

// Vector tangent.  Three-tier argument reduction by magnitude (Cody-Waite
// with 2, then 7 split-pi constants, then rempi/Payne-Hanek style fallback),
// followed by a half-angle polynomial and the tan(2a) reconstruction.
EXPORT CONST VECTOR_CC vdouble xtan(vdouble d) {
#if !defined(DETERMINISTIC)
  vdouble u, s, x, y;
  vopmask o;
  vint ql;

  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {
    // Small range: single-stage Cody-Waite reduction by pi/2.
    vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
    ql = vrint_vi_vd(dql);
    x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2 * 0.5), d);
    x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), x);
  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1e+6))))) {
    // Medium range: split the quotient into a high part (multiple of 2^24)
    // and a low part, then subtract pi/2 in four pieces (PI_A..PI_D).
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh));
    ql = vrint_vi_vd(dql);

    x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), x);
    x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), x);
    x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), x);
    x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), x);
    x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), x);
    x = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), x);
  } else {
    // Huge arguments: full-precision reduction via rempi.
    ddi_t ddi = rempi(d);
    ql = ddigeti_vi_ddi(ddi);
    x = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));
    // Inf/NaN input -> NaN (the first OR is subsumed by the second; kept
    // as in upstream).
    x = vreinterpret_vd_vm(vor_vm_vo64_vm(visinf_vo_vd(d), vreinterpret_vm_vd(x)));
    x = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(x)));
  }

  // Half-angle: evaluate the polynomial at x/2, then rebuild tan via
  // tan(2a) = 2u / (1 - u^2).
  x = vmul_vd_vd_vd(x, vcast_vd_d(0.5));
  s = vmul_vd_vd_vd(x, x);

  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY8(s, s2, s4,
            +0.3245098826639276316e-3,
            +0.5619219738114323735e-3,
            +0.1460781502402784494e-2,
            +0.3591611540792499519e-2,
            +0.8863268409563113126e-2,
            +0.2186948728185535498e-1,
            +0.5396825399517272970e-1,
            +0.1333333333330500581e+0);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.3333333333333343695e+0));
  u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, x), x);

  y = vmla_vd_vd_vd_vd(u, u, vcast_vd_d(-1));
  x = vmul_vd_vd_vd(u, vcast_vd_d(-2));

  // Odd quadrants take the reciprocal (negated) form.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1)));
  u = vdiv_vd_vd_vd(vsel_vd_vo_vd_vd(o, vneg_vd_vd(y), x), vsel_vd_vo_vd_vd(o, x, y));
  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u); // tan(+-0) = +-0

  return u;
#else // #if !defined(DETERMINISTIC)
  // Deterministic build: branch-free variant -- compute the cheap reduction
  // for all lanes, then patch in the wider-range reductions only where
  // needed, so results never depend on which lanes triggered a branch.
  vdouble u, s, x, y;
  vopmask o;
  vint ql;

  vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
  ql = vrint_vi_vd(dql);
  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2 * 0.5), d);
  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), s);
  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2));

  if (!LIKELY(vtestallones_i_vo64(g))) {
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh));

    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), u);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), u);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), u);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), u);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), u);
    u = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), u);

    // Merge: keep the cheap result where g holds, the wide result elsewhere.
    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql));
    s = vsel_vd_vo_vd_vd(g, s, u);

    g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1e+6));

    if (!LIKELY(vtestallones_i_vo64(g))) {
      ddi_t ddi = rempi(d);
      vint ql2 = ddigeti_vi_ddi(ddi);
      u = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));
      u = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(u)));

      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2);
      s = vsel_vd_vo_vd_vd(g, s, u);
    }
  }

  // Same half-angle evaluation and reconstruction as the branchy path.
  x = vmul_vd_vd_vd(s, vcast_vd_d(0.5));
  s = vmul_vd_vd_vd(x, x);

  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY8(s, s2, s4,
            +0.3245098826639276316e-3,
            +0.5619219738114323735e-3,
            +0.1460781502402784494e-2,
            +0.3591611540792499519e-2,
            +0.8863268409563113126e-2,
            +0.2186948728185535498e-1,
            +0.5396825399517272970e-1,
            +0.1333333333330500581e+0);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.3333333333333343695e+0));
  u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, x), x);

  y = vmla_vd_vd_vd_vd(u, u, vcast_vd_d(-1));
  x =
vmul_vd_vd_vd(u, vcast_vd_d(-2));

  // Odd quadrants take the reciprocal (negated) form.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1)));
  u = vdiv_vd_vd_vd(vsel_vd_vo_vd_vd(o, vneg_vd_vd(y), x), vsel_vd_vo_vd_vd(o, x, y));
  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u); // tan(+-0) = +-0

  return u;
#endif // #if !defined(DETERMINISTIC)
}

// Higher-accuracy tangent (_u1 tier): same three-tier reduction as xtan but
// the residual and reconstruction are carried in double-double arithmetic.
EXPORT CONST VECTOR_CC vdouble xtan_u1(vdouble d) {
#if !defined(DETERMINISTIC)
  vdouble u;
  vdouble2 s, t, x, y;
  vopmask o;
  vint ql;

  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {
    // Small range: Cody-Waite reduction, residual kept as a double-double.
    vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
    ql = vrint_vi_vd(dql);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5), d);
    s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5)));
  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {
    // Medium range: quotient from a double-double 2/pi product, then
    // four-piece pi/2 subtraction.
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    s = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(vcast_vd2_d_d(M_2_PI_H, M_2_PI_L), d),
                          vsub_vd_vd_vd(vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-0.5), vcast_vd_d(0.5)), dqh));
    const vdouble dql = vtruncate_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s)));
    ql = vrint_vi_vd(dql);

    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    s = ddadd_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5 )));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5 )));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5 )));
    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5)));
  } else {
    // Huge arguments: full-precision reduction via rempi; inf/NaN -> NaN.
    ddi_t ddi = rempi(d);
    ql = ddigeti_vi_ddi(ddi);
    s = ddigetdd_vd2_ddi(ddi);
    o = vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d));
    s = vd2setx_vd2_vd2_vd(s, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(s)))));
    s = vd2sety_vd2_vd2_vd(s, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(s)))));
  }

  // Half-angle in double-double, then tan(2a) = -2x / (x^2 - 1).
  t = ddscale_vd2_vd2_vd(s, vcast_vd_d(0.5));
  s = ddsqu_vd2_vd2(t);

  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY8(vd2getx_vd_vd2(s), s2, s4,
            +0.3245098826639276316e-3,
            +0.5619219738114323735e-3,
            +0.1460781502402784494e-2,
            +0.3591611540792499519e-2,
            +0.8863268409563113126e-2,
            +0.2186948728185535498e-1,
            +0.5396825399517272970e-1,
            +0.1333333333330500581e+0);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(+0.3333333333333343695e+0));
  x = ddadd_vd2_vd2_vd2(t, ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(s, t), u));

  y = ddadd_vd2_vd_vd2(vcast_vd_d(-1), ddsqu_vd2_vd2(x));
  x = ddscale_vd2_vd2_vd(x, vcast_vd_d(-2));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1)));
  x = dddiv_vd2_vd2_vd2(vsel_vd2_vo_vd2_vd2(o, ddneg_vd2_vd2(y), x), vsel_vd2_vo_vd2_vd2(o, x, y));

  u = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));
  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u); // tan(+-0) = +-0

  return u;
#else // #if !defined(DETERMINISTIC)
  // Deterministic build: branch-free merge of the three reduction tiers
  // (see xtan's deterministic path for the pattern).
  vdouble u;
  vdouble2 s, t, x, y;
  vopmask o;
  vint ql;

  const vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
  ql = vrint_vi_vd(dql);
  u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5), d);
  s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5)));
  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2));

  if (!LIKELY(vtestallones_i_vo64(g))) {
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    x = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(vcast_vd2_d_d(M_2_PI_H, M_2_PI_L), d),
                          vsub_vd_vd_vd(vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-0.5), vcast_vd_d(0.5)), dqh));
    const vdouble dql = vtruncate_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)));

    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    x = ddadd_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5 )));
    x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5)));
    x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5 )));
    x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5)));
    x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5 )));
    x = ddadd_vd2_vd2_vd(x, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5)));

    // Merge wide-range result into lanes where the cheap reduction failed.
    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql));
    s = vsel_vd2_vo_vd2_vd2(g, s, x);

    g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX));

    if (!LIKELY(vtestallones_i_vo64(g))) {
      ddi_t ddi = rempi(d);
      x = ddigetdd_vd2_ddi(ddi);
      o = vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d));
      x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));
      x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(x)))));

      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ddigeti_vi_ddi(ddi));
      s = vsel_vd2_vo_vd2_vd2(g, s, x);
    }
  }

  t = ddscale_vd2_vd2_vd(s, vcast_vd_d(0.5));
  s = ddsqu_vd2_vd2(t);

  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY8(vd2getx_vd_vd2(s), s2, s4,
            +0.3245098826639276316e-3,
            +0.5619219738114323735e-3,
            +0.1460781502402784494e-2,
            +0.3591611540792499519e-2,
            +0.8863268409563113126e-2,
            +0.2186948728185535498e-1,
            +0.5396825399517272970e-1,
            +0.1333333333330500581e+0);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(+0.3333333333333343695e+0));
  x = ddadd_vd2_vd2_vd2(t, ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(s, t), u));

  y = ddadd_vd2_vd_vd2(vcast_vd_d(-1), ddsqu_vd2_vd2(x));
  x = ddscale_vd2_vd2_vd(x, vcast_vd_d(-2));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1)));
  x = dddiv_vd2_vd2_vd2(vsel_vd2_vo_vd2_vd2(o, ddneg_vd2_vd2(y), x), vsel_vd2_vo_vd2_vd2(o, x, y));

  u = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));
  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d,
vcast_vd_d(0)), d, u); // tan(+-0) = +-0

  return u;
#endif // #if !defined(DETERMINISTIC)
}

// Core atan2 kernel (plain double).  Encodes the octant in q (multiples of
// pi/2 to add back at the end) and evaluates atan of the reduced ratio s/t
// with a degree-19 odd polynomial.  NOTE(review): vsel_vi_vd_vi appears to
// select on the sign of its vdouble argument -- confirm against the helper's
// definition elsewhere in this file.
static INLINE CONST VECTOR_CC vdouble atan2k(vdouble y, vdouble x) {
  vdouble s, t, u;
  vint q;
  vopmask p;

  q = vsel_vi_vd_vi(x, vcast_vi_i(-2));
  x = vabs_vd_vd(x);

  q = vsel_vi_vd_vd_vi_vi(x, y, vadd_vi_vi_vi(q, vcast_vi_i(1)), q);
  p = vlt_vo_vd_vd(x, y);
  s = vsel_vd_vo_vd_vd(p, vneg_vd_vd(x), y);
  t = vmax_vd_vd_vd(x, y);

  // Reduce to |s/t| <= 1 so the polynomial converges.
  s = vdiv_vd_vd_vd(s, t);
  t = vmul_vd_vd_vd(s, s);

  vdouble t2 = vmul_vd_vd_vd(t, t), t4 = vmul_vd_vd_vd(t2, t2), t8 = vmul_vd_vd_vd(t4, t4), t16 = vmul_vd_vd_vd(t8, t8);
  u = POLY19(t, t2, t4, t8, t16,
             -1.88796008463073496563746e-05,
             0.000209850076645816976906797,
             -0.00110611831486672482563471,
             0.00370026744188713119232403,
             -0.00889896195887655491740809,
             0.016599329773529201970117,
             -0.0254517624932312641616861,
             0.0337852580001353069993897,
             -0.0407629191276836500001934,
             0.0466667150077840625632675,
             -0.0523674852303482457616113,
             0.0587666392926673580854313,
             -0.0666573579361080525984562,
             0.0769219538311769618355029,
             -0.090908995008245008229153,
             0.111111105648261418443745,
             -0.14285714266771329383765,
             0.199999999996591265594148,
             -0.333333333333311110369124);

  t = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(t, u), s);
  t = vmla_vd_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(M_PI/2), t); // add back q * pi/2

  return t;
}

// Double-double atan2 kernel backing the _u1 entry points.  Same octant
// scheme as atan2k, but the division, squaring and final pi/2 multiples are
// all carried in double-double precision.
static INLINE CONST VECTOR_CC vdouble2 atan2k_u1(vdouble2 y, vdouble2 x) {
  vdouble u;
  vdouble2 s, t;
  vint q;
  vopmask p;

  q = vsel_vi_vd_vi(vd2getx_vd_vd2(x), vcast_vi_i(-2));
  p = vlt_vo_vd_vd(vd2getx_vd_vd2(x), vcast_vd_d(0));
  // b holds the sign bit where x < 0; XOR both halves to take |x|.
  vmask b = vand_vm_vo64_vm(p, vreinterpret_vm_vd(vcast_vd_d(-0.0)));
  x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(b, vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));
  x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(b, vreinterpret_vm_vd(vd2gety_vd_vd2(x)))));

  q = vsel_vi_vd_vd_vi_vi(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y), vadd_vi_vi_vi(q, vcast_vi_i(1)), q);
  p = vlt_vo_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  s = vsel_vd2_vo_vd2_vd2(p, ddneg_vd2_vd2(x), y);
  t = vsel_vd2_vo_vd2_vd2(p, y, x);

  s = dddiv_vd2_vd2_vd2(s, t);
  t = ddsqu_vd2_vd2(s);
  t = ddnormalize_vd2_vd2(t);

  vdouble t2 = vmul_vd_vd_vd(vd2getx_vd_vd2(t), vd2getx_vd_vd2(t)), t4 = vmul_vd_vd_vd(t2, t2), t8 = vmul_vd_vd_vd(t4, t4), t16 = vmul_vd_vd_vd(t8, t8);
  u = POLY16(vd2getx_vd_vd2(t), t2, t4, t8,
             1.06298484191448746607415e-05,
             -0.000125620649967286867384336,
             0.00070557664296393412389774,
             -0.00251865614498713360352999,
             0.00646262899036991172313504,
             -0.0128281333663399031014274,
             0.0208024799924145797902497,
             -0.0289002344784740315686289,
             0.0359785005035104590853656,
             -0.041848579703592507506027,
             0.0470843011653283988193763,
             -0.0524914210588448421068719,
             0.0587946590969581003860434,
             -0.0666620884778795497194182,
             0.0769225330296203768654095,
             -0.0909090442773387574781907);

  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(t), vcast_vd_d(0.111111108376896236538123));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(t), vcast_vd_d(-0.142857142756268568062339));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(t), vcast_vd_d(0.199999999997977351284817));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(t), vcast_vd_d(-0.333333333333317605173818));

  t = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(s, t), u));
  // Add back q * (pi/2) with pi/2 as an exact double-double constant.
  t = ddadd_vd2_vd2_vd2(ddmul_vd2_vd2_vd(vcast_vd2_d_d(1.570796326794896557998982, 6.12323399573676603586882e-17), vcast_vd_vi(q)), t);

  return t;
}

// Helper for atan2 special cases: returns m masked to lanes where d is
// infinite, with d's sign bit copied onto it; zero elsewhere.
static INLINE CONST VECTOR_CC vdouble visinf2_vd_vd_vd(vdouble d, vdouble m) {
  return vreinterpret_vd_vm(vand_vm_vo64_vm(visinf_vo_vd(d), vor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(m))));
}

// atan2(y, x): kernel result plus explicit IEEE special-case handling
// (x or y zero/inf/NaN, sign propagation via mulsign).
EXPORT CONST VECTOR_CC vdouble xatan2(vdouble y, vdouble x) {
  vdouble r = atan2k(vabs_vd_vd(y), x);

  r = vmulsign_vd_vd_vd(r, x);
  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, vcast_vd_d(0))), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2_vd_vd_vd(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/2), x))), r);
  r = vsel_vd_vo_vd_vd(visinf_vo_vd(y), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2_vd_vd_vd(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/4), x))), r);
  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(y, vcast_vd_d(0.0)), vreinterpret_vd_vm(vand_vm_vo64_vm(vsignbit_vo_vd(x), vreinterpret_vm_vd(vcast_vd_d(M_PI)))), r);

  // NaN in either input -> NaN; otherwise give the result y's sign.
  r = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vreinterpret_vm_vd(vmulsign_vd_vd_vd(r, y))));
  return r;
}

// Higher-accuracy atan2.  Tiny |x| is pre-scaled by 2^53 (y too, so the
// ratio is unchanged) to avoid losing precision near the underflow bound.
EXPORT CONST VECTOR_CC vdouble xatan2_u1(vdouble y, vdouble x) {
  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(5.5626846462680083984e-309)); // nexttoward((1.0 / DBL_MAX), 1)
  x = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(x, vcast_vd_d(UINT64_C(1) << 53)), x);
  y = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(y, vcast_vd_d(UINT64_C(1) << 53)), y);

  vdouble2 d = atan2k_u1(vcast_vd2_vd_vd(vabs_vd_vd(y), vcast_vd_d(0)), vcast_vd2_vd_vd(x, vcast_vd_d(0)));
  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d));

  // Same special-case ladder as xatan2.
  r = vmulsign_vd_vd_vd(r, x);
  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, vcast_vd_d(0))), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2_vd_vd_vd(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/2), x))), r);
  r = vsel_vd_vo_vd_vd(visinf_vo_vd(y), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2_vd_vd_vd(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/4), x))), r);
  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(y, vcast_vd_d(0.0)), vreinterpret_vd_vm(vand_vm_vo64_vm(vsignbit_vo_vd(x), vreinterpret_vm_vd(vcast_vd_d(M_PI)))), r);

  r = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vreinterpret_vm_vd(vmulsign_vd_vd_vd(r, y))));
  return r;
}

// arcsine.  |d| < 0.5 uses the direct series in d; otherwise the identity
// asin(d) = pi/2 - 2*asin(sqrt((1-|d|)/2)) keeps the argument small.
EXPORT CONST VECTOR_CC vdouble xasin(vdouble d) {
  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5));
  vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5)));
  vdouble x = vsel_vd_vo_vd_vd(o, vabs_vd_vd(d), vsqrt_vd_vd(x2)), u;

  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8);
  u = POLY12(x2, x4, x8, x16,
             +0.3161587650653934628e-1,
             -0.1581918243329996643e-1,
+0.1929045477267910674e-1,
             +0.6606077476277170610e-2,
             +0.1215360525577377331e-1,
             +0.1388715184501609218e-1,
             +0.1735956991223614604e-1,
             +0.2237176181932048341e-1,
             +0.3038195928038132237e-1,
             +0.4464285681377102438e-1,
             +0.7500000000378581611e-1,
             +0.1666666666666497543e+0);

  u = vmla_vd_vd_vd_vd(u, vmul_vd_vd_vd(x, x2), x);

  // Large-|d| branch: asin(d) = pi/2 - 2u; result takes d's sign.
  vdouble r = vsel_vd_vo_vd_vd(o, u, vmla_vd_vd_vd_vd(u, vcast_vd_d(-2), vcast_vd_d(M_PI/2)));
  return vmulsign_vd_vd_vd(r, d);
}

// Higher-accuracy arcsine: same branch structure as xasin, with the sqrt
// and the final pi/4 combination done in double-double arithmetic.
EXPORT CONST VECTOR_CC vdouble xasin_u1(vdouble d) {
  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5));
  vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5))), u;
  vdouble2 x = vsel_vd2_vo_vd2_vd2(o, vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), ddsqrt_vd2_vd(x2));
  x = vsel_vd2_vo_vd2_vd2(veq_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1.0)), vcast_vd2_d_d(0, 0), x); // exact endpoints

  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8);
  u = POLY12(x2, x4, x8, x16,
             +0.3161587650653934628e-1,
             -0.1581918243329996643e-1,
             +0.1929045477267910674e-1,
             +0.6606077476277170610e-2,
             +0.1215360525577377331e-1,
             +0.1388715184501609218e-1,
             +0.1735956991223614604e-1,
             +0.2237176181932048341e-1,
             +0.3038195928038132237e-1,
             +0.4464285681377102438e-1,
             +0.7500000000378581611e-1,
             +0.1666666666666497543e+0);

  u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)));

  // pi/4 carried as a double-double constant so the subtraction is accurate.
  vdouble2 y = ddsub_vd2_vd2_vd(ddsub_vd2_vd2_vd2(vcast_vd2_d_d(3.141592653589793116/4, 1.2246467991473532072e-16/4), x), u);
  vdouble r = vsel_vd_vo_vd_vd(o, vadd_vd_vd_vd(u, vd2getx_vd_vd2(x)), vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(y), vd2gety_vd_vd2(y)), vcast_vd_d(2)));
  return vmulsign_vd_vd_vd(r, d);
}

// arccosine.  Same half-angle identity as xasin; the d < 0 case reflects
// the result about pi.
EXPORT CONST VECTOR_CC vdouble xacos(vdouble d) {
  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5));
  vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5))), u;
  vdouble x = vsel_vd_vo_vd_vd(o, vabs_vd_vd(d), vsqrt_vd_vd(x2));
  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1.0)), vcast_vd_d(0), x); // exact endpoints

  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8);
  u = POLY12(x2, x4, x8, x16,
             +0.3161587650653934628e-1,
             -0.1581918243329996643e-1,
             +0.1929045477267910674e-1,
             +0.6606077476277170610e-2,
             +0.1215360525577377331e-1,
             +0.1388715184501609218e-1,
             +0.1735956991223614604e-1,
             +0.2237176181932048341e-1,
             +0.3038195928038132237e-1,
             +0.4464285681377102438e-1,
             +0.7500000000378581611e-1,
             +0.1666666666666497543e+0);

  u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(x2, x));

  vdouble y = vsub_vd_vd_vd(vcast_vd_d(M_PI/2), vadd_vd_vd_vd(vmulsign_vd_vd_vd(x, d), vmulsign_vd_vd_vd(u, d)));
  x = vadd_vd_vd_vd(x, u);
  vdouble r = vsel_vd_vo_vd_vd(o, y, vmul_vd_vd_vd(x, vcast_vd_d(2)));

  // d <= -0.5: acos(d) = pi - acos(-d), with pi as a double-double constant.
  return vsel_vd_vo_vd_vd(vandnot_vo_vo_vo(o, vlt_vo_vd_vd(d, vcast_vd_d(0))),
                          vd2getx_vd_vd2(ddadd_vd2_vd2_vd(vcast_vd2_d_d(3.141592653589793116, 1.2246467991473532072e-16), vneg_vd_vd(r))), r);
}

// Higher-accuracy arccosine: double-double throughout the reconstruction.
EXPORT CONST VECTOR_CC vdouble xacos_u1(vdouble d) {
  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5));
  vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5))), u;
  vdouble2 x = vsel_vd2_vo_vd2_vd2(o, vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), ddsqrt_vd2_vd(x2));
  x = vsel_vd2_vo_vd2_vd2(veq_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1.0)), vcast_vd2_d_d(0, 0), x);

  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8);
  u = POLY12(x2, x4, x8, x16,
             +0.3161587650653934628e-1,
             -0.1581918243329996643e-1,
             +0.1929045477267910674e-1,
             +0.6606077476277170610e-2,
             +0.1215360525577377331e-1,
             +0.1388715184501609218e-1,
             +0.1735956991223614604e-1,
             +0.2237176181932048341e-1,
             +0.3038195928038132237e-1,
             +0.4464285681377102438e-1,
             +0.7500000000378581611e-1,
             +0.1666666666666497543e+0);

  u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)));

  // pi/2 as a double-double constant.
  vdouble2 y = ddsub_vd2_vd2_vd2(vcast_vd2_d_d(3.141592653589793116/2, 1.2246467991473532072e-16/2),
                                 ddadd_vd2_vd_vd(vmulsign_vd_vd_vd(vd2getx_vd_vd2(x), d), vmulsign_vd_vd_vd(u, d)));
  x = ddadd_vd2_vd2_vd(x, u);

  y = vsel_vd2_vo_vd2_vd2(o, y, ddscale_vd2_vd2_vd(x, vcast_vd_d(2)));

  // d <= -0.5: reflect about pi.
  y = vsel_vd2_vo_vd2_vd2(vandnot_vo_vo_vo(o, vlt_vo_vd_vd(d, vcast_vd_d(0))),
                          ddsub_vd2_vd2_vd2(vcast_vd2_d_d(3.141592653589793116, 1.2246467991473532072e-16), y), y);

  return vadd_vd_vd_vd(vd2getx_vd_vd2(y), vd2gety_vd_vd2(y));
}

// Higher-accuracy arctangent, implemented as atan2_u1(|d|, 1) with the sign
// restored at the end; infinity maps to +-pi/2.
EXPORT CONST VECTOR_CC vdouble xatan_u1(vdouble d) {
  vdouble2 d2 = atan2k_u1(vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), vcast_vd2_d_d(1, 0));
  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(d2), vd2gety_vd_vd2(d2));
  r = vsel_vd_vo_vd_vd(visinf_vo_vd(d), vcast_vd_d(1.570796326794896557998982), r);
  return vmulsign_vd_vd_vd(r, d);
}

// arctangent (faster tier).  q records the sign (bit 1) and whether the
// argument was reciprocated (bit 0); both are undone after the polynomial.
EXPORT CONST VECTOR_CC vdouble xatan(vdouble s) {
  vdouble t, u;
  vint q;
#if defined(__INTEL_COMPILER) && defined(ENABLE_PURECFMA_SCALAR)
  vdouble w = s; // keep the raw input for the +-0 workaround below
#endif

  q = vsel_vi_vd_vi(s, vcast_vi_i(2));
  s = vabs_vd_vd(s);

  // |s| > 1: evaluate on 1/s and correct with pi/2 afterwards.
  q = vsel_vi_vd_vd_vi_vi(vcast_vd_d(1), s, vadd_vi_vi_vi(q, vcast_vi_i(1)), q);
  s = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(vcast_vd_d(1), s), vrec_vd_vd(s), s);

  t = vmul_vd_vd_vd(s, s);

  vdouble t2 = vmul_vd_vd_vd(t, t), t4 = vmul_vd_vd_vd(t2, t2), t8 = vmul_vd_vd_vd(t4, t4), t16 = vmul_vd_vd_vd(t8, t8);
  u = POLY19(t, t2, t4, t8, t16,
             -1.88796008463073496563746e-05,
             0.000209850076645816976906797,
             -0.00110611831486672482563471,
             0.00370026744188713119232403,
             -0.00889896195887655491740809,
             0.016599329773529201970117,
             -0.0254517624932312641616861,
             0.0337852580001353069993897,
             -0.0407629191276836500001934,
             0.0466667150077840625632675,
             -0.0523674852303482457616113,
             0.0587666392926673580854313,
             -0.0666573579361080525984562,
             0.0769219538311769618355029,
             -0.090908995008245008229153,
             0.111111105648261418443745,
             -0.14285714266771329383765,
             0.199999999996591265594148,
             -0.333333333333311110369124);

  t = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(t, u), s);
  t =
vsel_vd_vo_vd_vd(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1))), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), t), t);

  // Undo the reciprocal (bit 0 of q) and the sign (bit 1 of q).
  t = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(t)));

#if defined(__INTEL_COMPILER) && defined(ENABLE_PURECFMA_SCALAR)
  // ICC + pure-C FMA workaround: preserve +-0 exactly.
  t = vsel_vd_vo_vd_vd(veq_vo_vd_vd(w, vcast_vd_d(0)), w, t);
#endif
  return t;
}

#if !defined(DETERMINISTIC)
// Natural logarithm.  Extracts exponent e and mantissa m (either manually
// via ilogb/ldexp, or with AVX-512 getexp/getmant), then evaluates an
// atanh-style series in (m-1)/(m+1).
EXPORT CONST VECTOR_CC vdouble xlog(vdouble d) {
  vdouble x, x2;
  vdouble t, m;

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  // Subnormal inputs are scaled up by 2^64 first; e is corrected below.
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d);
  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
  m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e));
  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);
#else
  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);
  m = vgetmant_vd_vd(d);
#endif

  x = vdiv_vd_vd_vd(vsub_vd_vd_vd(m, vcast_vd_d(1)), vadd_vd_vd_vd(vcast_vd_d(1), m));
  x2 = vmul_vd_vd_vd(x, x);

  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x3 = vmul_vd_vd_vd(x, x2);
  t = POLY7(x2, x4, x8,
            0.153487338491425068243146,
            0.152519917006351951593857,
            0.181863266251982985677316,
            0.222221366518767365905163,
            0.285714294746548025383248,
            0.399999999950799600689777,
            0.6666666666667778740063);

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  // log(d) = 2x + e*ln2 + x^3 * poly(x^2)
  x = vmla_vd_vd_vd_vd(x, vcast_vd_d(2), vmul_vd_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e)));
  x = vmla_vd_vd_vd_vd(x3, t, x);

  // Special cases: +inf -> +inf; negative or NaN -> NaN; +-0 -> -inf.
  x = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), x);
  x = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), x);
  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-SLEEF_INFINITY), x);
#else
  x = vmla_vd_vd_vd_vd(x, vcast_vd_d(2), vmul_vd_vd_vd(vcast_vd_d(0.693147180559945286226764), e));
  x = vmla_vd_vd_vd_vd(x3, t, x);
  // AVX-512 vfixupimm handles the special cases in one instruction.
  x = vfixup_vd_vd_vd_vi2_i(x, d, vcast_vi2_i((5 << (5*4))), 0);
#endif

  return x;
}
#endif // #if !defined(DETERMINISTIC)

// Exponential.  Reduce d = q*ln2 + s (Cody-Waite with L2U/L2L), evaluate a
// degree-10 polynomial of exp(s), then scale by 2^q via ldexp.
EXPORT CONST VECTOR_CC vdouble xexp(vdouble d) {
  vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(R_LN2))), s;
  vint q = vrint_vi_vd(u);

  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L2U), d);
  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L2L), s);

#ifdef ENABLE_FMA_DP
  // FMA build: slightly different coefficient set tuned for fused ops.
  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4);
  u = POLY10(s, s2, s4, s8,
             +0.2081276378237164457e-8,
             +0.2511210703042288022e-7,
             +0.2755762628169491192e-6,
             +0.2755723402025388239e-5,
             +0.2480158687479686264e-4,
             +0.1984126989855865850e-3,
             +0.1388888888914497797e-2,
             +0.8333333333314938210e-2,
             +0.4166666666666602598e-1,
             +0.1666666666666669072e+0);
  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5000000000000000000e+0));
  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1000000000000000000e+1));
  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1000000000000000000e+1));
#else // #ifdef ENABLE_FMA_DP
  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4);
  u = POLY10(s, s2, s4, s8,
             2.08860621107283687536341e-09,
             2.51112930892876518610661e-08,
             2.75573911234900471893338e-07,
             2.75572362911928827629423e-06,
             2.4801587159235472998791e-05,
             0.000198412698960509205564975,
             0.00138888888889774492207962,
             0.00833333333331652721664984,
             0.0416666666666665047591422,
             0.166666666666666851703837);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5000000000000000000e+0));
  u = vadd_vd_vd_vd(vcast_vd_d(1), vmla_vd_vd_vd_vd(vmul_vd_vd_vd(s, s), u, s));
#endif // #ifdef ENABLE_FMA_DP

  u = vldexp2_vd_vd_vi(u, q);

  // Overflow threshold is the largest d with exp(d) finite in double;
  // d < -1000 flushes to zero.
  u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(709.78271114955742909217217426)), vcast_vd_d(SLEEF_INFINITY), u);
  u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-1000)), vreinterpret_vm_vd(u)));

  return u;
}

// expm1 kernel: like xexp but returns exp(d) - 1 without cancellation for
// small d (the q == 0 lanes keep the un-scaled polynomial result directly).
static INLINE CONST VECTOR_CC vdouble expm1k(vdouble d) {
  vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(R_LN2))), s;
  vint q = vrint_vi_vd(u);

  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L2U), d);
  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L2L), s);

  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4);
  u = POLY10(s, s2, s4, s8,
             2.08860621107283687536341e-09,
             2.51112930892876518610661e-08,
             2.75573911234900471893338e-07,
             2.75572362911928827629423e-06,
             2.4801587159235472998791e-05,
             0.000198412698960509205564975,
             0.00138888888889774492207962,
             0.00833333333331652721664984,
             0.0416666666666665047591422,
             0.166666666666666851703837);

  u = vadd_vd_vd_vd(vmla_vd_vd_vd_vd(s2, vcast_vd_d(0.5), vmul_vd_vd_vd(vmul_vd_vd_vd(s2, s), u)), s);

  // q != 0: rebuild exp(d) - 1 as 2^q * (u + 1) - 1.
  u = vsel_vd_vo_vd_vd(vcast_vo64_vo32(veq_vo_vi_vi(q, vcast_vi_i(0))), u,
                       vsub_vd_vd_vd(vldexp2_vd_vd_vi(vadd_vd_vd_vd(u, vcast_vd_d(1)), q), vcast_vd_d(1)));

  return u;
}

// Double-double log kernel used by the higher-accuracy log/pow family.
// (Definition continues beyond this chunk.)
static INLINE CONST VECTOR_CC vdouble2 logk(vdouble d) {
  vdouble2 x, x2, s;
  vdouble t, m;

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  // Subnormal inputs are scaled up by 2^64 first; e is corrected below.
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d);
  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
  m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e));
  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);
#else
  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);
  m = vgetmant_vd_vd(d);
#endif

  // x = (m-1)/(m+1) in double-double.
  x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m));
  x2 = ddsqu_vd2_vd2(x);

  vdouble x4 = vmul_vd_vd_vd(vd2getx_vd_vd2(x2), vd2getx_vd_vd2(x2)), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8);
  t = POLY9(vd2getx_vd_vd2(x2), x4, x8, x16,
0.116255524079935043668677, 0.103239680901072952701192, 0.117754809412463995466069, 0.13332981086846273921509, 0.153846227114512262845736, 0.181818180850050775676507, 0.222222222230083560345903, 0.285714285714249172087875, 0.400000000000000077715612); vdouble2 c = vcast_vd2_d_d(0.666666666666666629659233, 3.80554962542412056336616e-17); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e)); #else s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), e); #endif s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2))); x = ddmul_vd2_vd2_vd2(x2, x); s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd2(x, c)); x = ddmul_vd2_vd2_vd2(x2, x); s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(x, t)); return s; } #if !defined(DETERMINISTIC) EXPORT CONST VECTOR_CC vdouble xlog_u1(vdouble d) { vdouble2 x; vdouble t, m, x2; #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN)); d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d); vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75))); m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e)); e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e); #else vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75))); e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e); m = vgetmant_vd_vd(d); #endif x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m)); x2 = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x)); vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4); t = POLY7(x2, x4, x8, 0.1532076988502701353e+0, 0.1525629051003428716e+0, 0.1818605932937785996e+0, 0.2222214519839380009e+0, 0.2857142932794299317e+0, 0.3999999999635251990e+0, 0.6666666666667333541e+0); #if 
!defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e)); #else vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), e); #endif s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2))); s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)), t)); vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s)); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) r = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), r); r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), r); r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-SLEEF_INFINITY), r); #else r = vfixup_vd_vd_vd_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0); #endif return r; } #endif // #if !defined(DETERMINISTIC) static INLINE CONST VECTOR_CC vdouble expk(vdouble2 d) { vdouble u = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(R_LN2)); vdouble dq = vrint_vd_vd(u); vint q = vrint_vi_vd(dq); vdouble2 s, t; s = ddadd2_vd2_vd2_vd(d, vmul_vd_vd_vd(dq, vcast_vd_d(-L2U))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dq, vcast_vd_d(-L2L))); s = ddnormalize_vd2_vd2(s); vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4); u = POLY10(vd2getx_vd_vd2(s), s2, s4, s8, 2.51069683420950419527139e-08, 2.76286166770270649116855e-07, 2.75572496725023574143864e-06, 2.48014973989819794114153e-05, 0.000198412698809069797676111, 0.0013888888939977128960529, 0.00833333333332371417601081, 0.0416666666665409524128449, 0.166666666666666740681535, 0.500000000000000999200722); t = ddadd_vd2_vd_vd2(vcast_vd_d(1), s); t = ddadd_vd2_vd2_vd2(t, ddmul_vd2_vd2_vd(ddsqu_vd2_vd2(s), u)); u = vadd_vd_vd_vd(vd2getx_vd_vd2(t), 
vd2gety_vd_vd2(t)); u = vldexp2_vd_vd_vi(u, q); u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(vd2getx_vd_vd2(d), vcast_vd_d(-1000)), vreinterpret_vm_vd(u))); return u; } #if !defined(DETERMINISTIC) EXPORT CONST VECTOR_CC vdouble xpow(vdouble x, vdouble y) { #if 1 vopmask yisint = visint_vo_vd(y); vopmask yisodd = vand_vo_vo_vo(visodd_vo_vd(y), yisint); vdouble2 d = ddmul_vd2_vd2_vd(logk(vabs_vd_vd(x)), y); vdouble result = expk(d); result = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vd2getx_vd_vd2(d), vcast_vd_d(709.78271114955742909217217426)), vcast_vd_d(SLEEF_INFINITY), result); result = vmul_vd_vd_vd(result, vsel_vd_vo_vd_vd(vgt_vo_vd_vd(x, vcast_vd_d(0)), vcast_vd_d(1), vsel_vd_vo_vd_vd(yisint, vsel_vd_vo_vd_vd(yisodd, vcast_vd_d(-1.0), vcast_vd_d(1)), vcast_vd_d(SLEEF_NAN)))); vdouble efx = vmulsign_vd_vd_vd(vsub_vd_vd_vd(vabs_vd_vd(x), vcast_vd_d(1)), y); result = vsel_vd_vo_vd_vd(visinf_vo_vd(y), vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(efx, vcast_vd_d(0.0)), vreinterpret_vm_vd(vsel_vd_vo_vd_vd(veq_vo_vd_vd(efx, vcast_vd_d(0.0)), vcast_vd_d(1.0), vcast_vd_d(SLEEF_INFINITY))))), result); result = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, vcast_vd_d(0.0))), vmul_vd_vd_vd(vsel_vd_vo_vd_vd(yisodd, vsign_vd_vd(x), vcast_vd_d(1.0)), vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0.0)), vneg_vd_vd(y), y), vcast_vd_d(0.0)), vreinterpret_vm_vd(vcast_vd_d(SLEEF_INFINITY))))), result); result = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vreinterpret_vm_vd(result))); result = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(y, vcast_vd_d(0)), veq_vo_vd_vd(x, vcast_vd_d(1))), vcast_vd_d(1), result); return result; #else return expk(ddmul_vd2_vd2_vd(logk(x), y)); #endif } #endif // #if !defined(DETERMINISTIC) static INLINE CONST VECTOR_CC vdouble2 expk2(vdouble2 d) { vdouble u = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(R_LN2)); 
vdouble dq = vrint_vd_vd(u); vint q = vrint_vi_vd(dq); vdouble2 s, t; s = ddadd2_vd2_vd2_vd(d, vmul_vd_vd_vd(dq, vcast_vd_d(-L2U))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dq, vcast_vd_d(-L2L))); vdouble2 s2 = ddsqu_vd2_vd2(s), s4 = ddsqu_vd2_vd2(s2); vdouble s8 = vmul_vd_vd_vd(vd2getx_vd_vd2(s4), vd2getx_vd_vd2(s4)); u = POLY10(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s2), vd2getx_vd_vd2(s4), s8, +0.1602472219709932072e-9, +0.2092255183563157007e-8, +0.2505230023782644465e-7, +0.2755724800902135303e-6, +0.2755731892386044373e-5, +0.2480158735605815065e-4, +0.1984126984148071858e-3, +0.1388888888886763255e-2, +0.8333333333333347095e-2, +0.4166666666666669905e-1); t = ddadd_vd2_vd_vd2(vcast_vd_d(0.5), ddmul_vd2_vd2_vd(s, vcast_vd_d(+0.1666666666666666574e+0))); t = ddadd_vd2_vd_vd2(vcast_vd_d(1.0), ddmul_vd2_vd2_vd2(t, s)); t = ddadd_vd2_vd_vd2(vcast_vd_d(1.0), ddmul_vd2_vd2_vd2(t, s)); t = ddadd_vd2_vd2_vd2(t, ddmul_vd2_vd2_vd(s4, u)); t = vd2setx_vd2_vd2_vd(t, vldexp2_vd_vd_vi(vd2getx_vd_vd2(t), q)); t = vd2sety_vd2_vd2_vd(t, vldexp2_vd_vd_vi(vd2gety_vd_vd2(t), q)); t = vd2setx_vd2_vd2_vd(t, vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(vd2getx_vd_vd2(d), vcast_vd_d(-1000)), vreinterpret_vm_vd(vd2getx_vd_vd2(t))))); t = vd2sety_vd2_vd2_vd(t, vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(vd2getx_vd_vd2(d), vcast_vd_d(-1000)), vreinterpret_vm_vd(vd2gety_vd_vd2(t))))); return t; } #if !defined(DETERMINISTIC) EXPORT CONST VECTOR_CC vdouble xsinh(vdouble x) { vdouble y = vabs_vd_vd(x); vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0))); d = ddsub_vd2_vd2_vd2(d, ddrec_vd2_vd2(d)); y = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(0.5)); y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(710)), visnan_vo_vd(y)), vcast_vd_d(SLEEF_INFINITY), y); y = vmulsign_vd_vd_vd(y, x); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); return y; } EXPORT CONST VECTOR_CC vdouble xcosh(vdouble x) { 
vdouble y = vabs_vd_vd(x); vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0))); d = ddadd_vd2_vd2_vd2(d, ddrec_vd2_vd2(d)); y = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(0.5)); y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(710)), visnan_vo_vd(y)), vcast_vd_d(SLEEF_INFINITY), y); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); return y; } EXPORT CONST VECTOR_CC vdouble xtanh(vdouble x) { vdouble y = vabs_vd_vd(x); vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0))); vdouble2 e = ddrec_vd2_vd2(d); d = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd2_vd2(d, ddneg_vd2_vd2(e)), ddadd2_vd2_vd2_vd2(d, e)); y = vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)); y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(18.714973875)), visnan_vo_vd(y)), vcast_vd_d(1.0), y); y = vmulsign_vd_vd_vd(y, x); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); return y; } EXPORT CONST VECTOR_CC vdouble xsinh_u35(vdouble x) { vdouble e = expm1k(vabs_vd_vd(x)); vdouble y = vdiv_vd_vd_vd(vadd_vd_vd_vd(e, vcast_vd_d(2)), vadd_vd_vd_vd(e, vcast_vd_d(1))); y = vmul_vd_vd_vd(y, vmul_vd_vd_vd(vcast_vd_d(0.5), e)); y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(709)), visnan_vo_vd(y)), vcast_vd_d(SLEEF_INFINITY), y); y = vmulsign_vd_vd_vd(y, x); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); return y; } EXPORT CONST VECTOR_CC vdouble xcosh_u35(vdouble x) { vdouble e = xexp(vabs_vd_vd(x)); vdouble y = vmla_vd_vd_vd_vd(vcast_vd_d(0.5), e, vdiv_vd_vd_vd(vcast_vd_d(0.5), e)); y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(709)), visnan_vo_vd(y)), vcast_vd_d(SLEEF_INFINITY), y); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); return y; } EXPORT CONST VECTOR_CC vdouble xtanh_u35(vdouble x) { vdouble d = expm1k(vmul_vd_vd_vd(vcast_vd_d(2), vabs_vd_vd(x))); 
vdouble y = vdiv_vd_vd_vd(d, vadd_vd_vd_vd(vcast_vd_d(2), d)); y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(18.714973875)), visnan_vo_vd(y)), vcast_vd_d(1.0), y); y = vmulsign_vd_vd_vd(y, x); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); return y; } static INLINE CONST VECTOR_CC vdouble2 logk2(vdouble2 d) { vdouble2 x, x2, m, s; vdouble t; vint e; e = vilogbk_vi_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(d), vcast_vd_d(1.0/0.75))); m = vd2setxy_vd2_vd_vd(vldexp2_vd_vd_vi(vd2getx_vd_vd2(d), vneg_vi_vi(e)), vldexp2_vd_vd_vi(vd2gety_vd_vd2(d), vneg_vi_vi(e))); x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(m, vcast_vd_d(-1)), ddadd2_vd2_vd2_vd(m, vcast_vd_d(1))); x2 = ddsqu_vd2_vd2(x); vdouble x4 = vmul_vd_vd_vd(vd2getx_vd_vd2(x2), vd2getx_vd_vd2(x2)), x8 = vmul_vd_vd_vd(x4, x4); t = POLY7(vd2getx_vd_vd2(x2), x4, x8, 0.13860436390467167910856, 0.131699838841615374240845, 0.153914168346271945653214, 0.181816523941564611721589, 0.22222224632662035403996, 0.285714285511134091777308, 0.400000000000914013309483); t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(x2), vcast_vd_d(0.666666666666664853302393)); s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e)); s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2))); s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(x2, x), t)); return s; } EXPORT CONST VECTOR_CC vdouble xasinh(vdouble x) { vdouble y = vabs_vd_vd(x); vopmask o = vgt_vo_vd_vd(y, vcast_vd_d(1)); vdouble2 d; d = vsel_vd2_vo_vd2_vd2(o, ddrec_vd2_vd(x), vcast_vd2_vd_vd(y, vcast_vd_d(0))); d = ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddsqu_vd2_vd2(d), vcast_vd_d(1))); d = vsel_vd2_vo_vd2_vd2(o, ddmul_vd2_vd2_vd(d, y), d); d = logk2(ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd(d, x))); y = vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)); y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(SQRT_DBL_MAX)), visnan_vo_vd(y)), 
vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), x), y); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); y = vsel_vd_vo_vd_vd(visnegzero_vo_vd(x), vcast_vd_d(-0.0), y); return y; } EXPORT CONST VECTOR_CC vdouble xacosh(vdouble x) { vdouble2 d = logk2(ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddsqrt_vd2_vd2(ddadd2_vd2_vd_vd(x, vcast_vd_d(1))), ddsqrt_vd2_vd2(ddadd2_vd2_vd_vd(x, vcast_vd_d(-1)))), x)); vdouble y = vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)); y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(SQRT_DBL_MAX)), visnan_vo_vd(y)), vcast_vd_d(SLEEF_INFINITY), y); y = vreinterpret_vd_vm(vandnot_vm_vo64_vm(veq_vo_vd_vd(x, vcast_vd_d(1.0)), vreinterpret_vm_vd(y))); y = vreinterpret_vd_vm(vor_vm_vo64_vm(vlt_vo_vd_vd(x, vcast_vd_d(1.0)), vreinterpret_vm_vd(y))); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); return y; } EXPORT CONST VECTOR_CC vdouble xatanh(vdouble x) { vdouble y = vabs_vd_vd(x); vdouble2 d = logk2(dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(1), y), ddadd2_vd2_vd_vd(vcast_vd_d(1), vneg_vd_vd(y)))); y = vreinterpret_vd_vm(vor_vm_vo64_vm(vgt_vo_vd_vd(y, vcast_vd_d(1.0)), vreinterpret_vm_vd(vsel_vd_vo_vd_vd(veq_vo_vd_vd(y, vcast_vd_d(1.0)), vcast_vd_d(SLEEF_INFINITY), vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(0.5)))))); y = vmulsign_vd_vd_vd(y, x); y = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(x), visnan_vo_vd(y)), vreinterpret_vm_vd(y))); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); return y; } EXPORT CONST VECTOR_CC vdouble xcbrt(vdouble d) { vdouble x, y, q = vcast_vd_d(1.0); vint e, qu, re; vdouble t; #if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA) vdouble s = d; #endif e = vadd_vi_vi_vi(vilogbk_vi_vd(vabs_vd_vd(d)), vcast_vi_i(1)); d = vldexp2_vd_vd_vi(d, vneg_vi_vi(e)); t = vadd_vd_vd_vd(vcast_vd_vi(e), vcast_vd_d(6144)); qu = 
vtruncate_vi_vd(vmul_vd_vd_vd(t, vcast_vd_d(1.0/3.0))); re = vtruncate_vi_vd(vsub_vd_vd_vd(t, vmul_vd_vd_vd(vcast_vd_vi(qu), vcast_vd_d(3)))); q = vsel_vd_vo_vd_vd(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(1))), vcast_vd_d(1.2599210498948731647672106), q); q = vsel_vd_vo_vd_vd(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(2))), vcast_vd_d(1.5874010519681994747517056), q); q = vldexp2_vd_vd_vi(q, vsub_vi_vi_vi(qu, vcast_vi_i(2048))); q = vmulsign_vd_vd_vd(q, d); d = vabs_vd_vd(d); x = vcast_vd_d(-0.640245898480692909870982); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.96155103020039511818595)); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-5.73353060922947843636166)); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(6.03990368989458747961407)); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-3.85841935510444988821632)); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.2307275302496609725722)); y = vmul_vd_vd_vd(x, x); y = vmul_vd_vd_vd(y, y); x = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vmlapn_vd_vd_vd_vd(d, y, x), vcast_vd_d(1.0 / 3.0))); y = vmul_vd_vd_vd(vmul_vd_vd_vd(d, x), x); y = vmul_vd_vd_vd(vsub_vd_vd_vd(y, vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(2.0 / 3.0), y), vmla_vd_vd_vd_vd(y, x, vcast_vd_d(-1.0)))), q); #if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA) y = vsel_vd_vo_vd_vd(visinf_vo_vd(s), vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), s), y); y = vsel_vd_vo_vd_vd(veq_vo_vd_vd(s, vcast_vd_d(0)), vmulsign_vd_vd_vd(vcast_vd_d(0), s), y); #endif return y; } EXPORT CONST VECTOR_CC vdouble xcbrt_u1(vdouble d) { vdouble x, y, z, t; vdouble2 q2 = vcast_vd2_d_d(1, 0), u, v; vint e, qu, re; #if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA) vdouble s = d; #endif e = vadd_vi_vi_vi(vilogbk_vi_vd(vabs_vd_vd(d)), vcast_vi_i(1)); d = vldexp2_vd_vd_vi(d, vneg_vi_vi(e)); t = vadd_vd_vd_vd(vcast_vd_vi(e), vcast_vd_d(6144)); qu = vtruncate_vi_vd(vmul_vd_vd_vd(t, vcast_vd_d(1.0/3.0))); re = vtruncate_vi_vd(vsub_vd_vd_vd(t, vmul_vd_vd_vd(vcast_vd_vi(qu), vcast_vd_d(3)))); q2 = 
vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(1))), vcast_vd2_d_d(1.2599210498948731907, -2.5899333753005069177e-17), q2); q2 = vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(2))), vcast_vd2_d_d(1.5874010519681995834, -1.0869008194197822986e-16), q2); q2 = vd2setxy_vd2_vd_vd(vmulsign_vd_vd_vd(vd2getx_vd_vd2(q2), d), vmulsign_vd_vd_vd(vd2gety_vd_vd2(q2), d)); d = vabs_vd_vd(d); x = vcast_vd_d(-0.640245898480692909870982); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.96155103020039511818595)); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-5.73353060922947843636166)); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(6.03990368989458747961407)); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-3.85841935510444988821632)); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.2307275302496609725722)); y = vmul_vd_vd_vd(x, x); y = vmul_vd_vd_vd(y, y); x = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vmlapn_vd_vd_vd_vd(d, y, x), vcast_vd_d(1.0 / 3.0))); z = x; u = ddmul_vd2_vd_vd(x, x); u = ddmul_vd2_vd2_vd2(u, u); u = ddmul_vd2_vd2_vd(u, d); u = ddadd2_vd2_vd2_vd(u, vneg_vd_vd(x)); y = vadd_vd_vd_vd(vd2getx_vd_vd2(u), vd2gety_vd_vd2(u)); y = vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(-2.0 / 3.0), y), z); v = ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(z, z), y); v = ddmul_vd2_vd2_vd(v, d); v = ddmul_vd2_vd2_vd2(v, q2); z = vldexp2_vd_vd_vi(vadd_vd_vd_vd(vd2getx_vd_vd2(v), vd2gety_vd_vd2(v)), vsub_vi_vi_vi(qu, vcast_vi_i(2048))); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) z = vsel_vd_vo_vd_vd(visinf_vo_vd(d), vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), vd2getx_vd_vd2(q2)), z); z = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vreinterpret_vd_vm(vsignbit_vm_vd(vd2getx_vd_vd2(q2))), z); #else z = vsel_vd_vo_vd_vd(visinf_vo_vd(s), vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), s), z); z = vsel_vd_vo_vd_vd(veq_vo_vd_vd(s, vcast_vd_d(0)), vmulsign_vd_vd_vd(vcast_vd_d(0), s), z); #endif return z; } #endif // #if !defined(DETERMINISTIC) EXPORT CONST VECTOR_CC vdouble xexp2(vdouble d) { vdouble u 
= vrint_vd_vd(d), s; vint q = vrint_vi_vd(u); s = vsub_vd_vd_vd(d, u); vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4); u = POLY10(s, s2, s4, s8, +0.4434359082926529454e-9, +0.7073164598085707425e-8, +0.1017819260921760451e-6, +0.1321543872511327615e-5, +0.1525273353517584730e-4, +0.1540353045101147808e-3, +0.1333355814670499073e-2, +0.9618129107597600536e-2, +0.5550410866482046596e-1, +0.2402265069591012214e+0); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.6931471805599452862e+0)); #ifdef ENABLE_FMA_DP u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(1)); #else u = vd2getx_vd_vd2(ddnormalize_vd2_vd2(ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(u, s)))); #endif u = vldexp2_vd_vd_vi(u, q); u = vsel_vd_vo_vd_vd(vge_vo_vd_vd(d, vcast_vd_d(1024)), vcast_vd_d(SLEEF_INFINITY), u); u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-2000)), vreinterpret_vm_vd(u))); return u; } EXPORT CONST VECTOR_CC vdouble xexp2_u35(vdouble d) { vdouble u = vrint_vd_vd(d), s; vint q = vrint_vi_vd(u); s = vsub_vd_vd_vd(d, u); vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4); u = POLY10(s, s2, s4, s8, +0.4434359082926529454e-9, +0.7073164598085707425e-8, +0.1017819260921760451e-6, +0.1321543872511327615e-5, +0.1525273353517584730e-4, +0.1540353045101147808e-3, +0.1333355814670499073e-2, +0.9618129107597600536e-2, +0.5550410866482046596e-1, +0.2402265069591012214e+0); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.6931471805599452862e+0)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1)); u = vldexp2_vd_vd_vi(u, q); u = vsel_vd_vo_vd_vd(vge_vo_vd_vd(d, vcast_vd_d(1024)), vcast_vd_d(SLEEF_INFINITY), u); u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-2000)), vreinterpret_vm_vd(u))); return u; } EXPORT CONST VECTOR_CC vdouble xexp10(vdouble d) { vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(LOG10_2))), s; vint q = vrint_vi_vd(u); s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L10U), d); s = 
vmla_vd_vd_vd_vd(u, vcast_vd_d(-L10L), s); u = vcast_vd_d(+0.2411463498334267652e-3); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1157488415217187375e-2)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5013975546789733659e-2)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1959762320720533080e-1)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.6808936399446784138e-1)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2069958494722676234e+0)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5393829292058536229e+0)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1171255148908541655e+1)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2034678592293432953e+1)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2650949055239205876e+1)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2302585092994045901e+1)); #ifdef ENABLE_FMA_DP u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(1)); #else u = vd2getx_vd_vd2(ddnormalize_vd2_vd2(ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(u, s)))); #endif u = vldexp2_vd_vd_vi(u, q); u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(308.25471555991671)), vcast_vd_d(SLEEF_INFINITY), u); u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-350)), vreinterpret_vm_vd(u))); return u; } EXPORT CONST VECTOR_CC vdouble xexp10_u35(vdouble d) { vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(LOG10_2))), s; vint q = vrint_vi_vd(u); s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L10U), d); s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L10L), s); vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4); u = POLY11(s, s2, s4, s8, +0.2411463498334267652e-3, +0.1157488415217187375e-2, +0.5013975546789733659e-2, +0.1959762320720533080e-1, +0.6808936399446784138e-1, +0.2069958494722676234e+0, +0.5393829292058536229e+0, +0.1171255148908541655e+1, +0.2034678592293432953e+1, +0.2650949055239205876e+1, +0.2302585092994045901e+1); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1)); u = vldexp2_vd_vd_vi(u, q); u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(308.25471555991671)), vcast_vd_d(SLEEF_INFINITY), 
u); u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-350)), vreinterpret_vm_vd(u))); return u; } #if !defined(DETERMINISTIC) EXPORT CONST VECTOR_CC vdouble xexpm1(vdouble a) { vdouble2 d = ddadd2_vd2_vd2_vd(expk2(vcast_vd2_vd_vd(a, vcast_vd_d(0))), vcast_vd_d(-1.0)); vdouble x = vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)); x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(709.782712893383996732223)), vcast_vd_d(SLEEF_INFINITY), x); x = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(a, vcast_vd_d(-36.736800569677101399113302437)), vcast_vd_d(-1), x); x = vsel_vd_vo_vd_vd(visnegzero_vo_vd(a), vcast_vd_d(-0.0), x); return x; } EXPORT CONST VECTOR_CC vdouble xlog10(vdouble d) { vdouble2 x; vdouble t, m, x2; #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN)); d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d); vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75))); m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e)); e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e); #else vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75))); e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e); m = vgetmant_vd_vd(d); #endif x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m)); x2 = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x)); vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4); t = POLY7(x2, x4, x8, +0.6653725819576758460e-1, +0.6625722782820833712e-1, +0.7898105214313944078e-1, +0.9650955035715275132e-1, +0.1240841409721444993e+0, +0.1737177927454605086e+0, +0.2895296546021972617e+0); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.30102999566398119802, -2.803728127785170339e-18), vcast_vd_vi(e)); #else vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.30102999566398119802, 
-2.803728127785170339e-18), e); #endif s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd2(x, vcast_vd2_d_d(0.86858896380650363334, 1.1430059694096389311e-17))); s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)), t)); vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s)); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) r = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), r); r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), r); r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-SLEEF_INFINITY), r); #else r = vfixup_vd_vd_vd_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0); #endif return r; } EXPORT CONST VECTOR_CC vdouble xlog2(vdouble d) { vdouble2 x; vdouble t, m, x2; #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN)); d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d); vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75))); m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e)); e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e); #else vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75))); e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e); m = vgetmant_vd_vd(d); #endif x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m)); x2 = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x)); vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4); t = POLY7(x2, x4, x8, +0.2211941750456081490e+0, +0.2200768693152277689e+0, +0.2623708057488514656e+0, +0.3205977477944495502e+0, +0.4121985945485324709e+0, +0.5770780162997058982e+0, +0.96179669392608091449); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) vdouble2 s = ddadd2_vd2_vd_vd2(vcast_vd_vi(e), ddmul_vd2_vd2_vd2(x, 
vcast_vd2_d_d(2.885390081777926774, 6.0561604995516736434e-18))); #else vdouble2 s = ddadd2_vd2_vd_vd2(e, ddmul_vd2_vd2_vd2(x, vcast_vd2_d_d(2.885390081777926774, 6.0561604995516736434e-18))); #endif s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)), t)); vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s)); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) r = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), r); r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), r); r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-SLEEF_INFINITY), r); #else r = vfixup_vd_vd_vd_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0); #endif return r; } EXPORT CONST VECTOR_CC vdouble xlog2_u35(vdouble d) { vdouble m, t, x, x2; #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN)); d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d); vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75))); m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e)); e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e); #else vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75))); e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e); m = vgetmant_vd_vd(d); #endif x = vdiv_vd_vd_vd(vsub_vd_vd_vd(m, vcast_vd_d(1)), vadd_vd_vd_vd(m, vcast_vd_d(1))); x2 = vmul_vd_vd_vd(x, x); t = vcast_vd_d(+0.2211941750456081490e+0); t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.2200768693152277689e+0)); t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.2623708057488514656e+0)); t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.3205977477944495502e+0)); t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.4121985945485324709e+0)); t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.5770780162997058982e+0)); t = vmla_vd_vd_vd_vd(t, x2, 
/* Tail of a logarithm routine whose head lies before this chunk; the
   2.885390081777926774 (= 2/ln 2) factor suggests a base-2 log family
   function -- confirm against the full file.  The polynomial part t is
   combined with the exponent contribution s, then IEEE special cases
   are patched (the AVX-512 path uses a single vfixup instead). */
vcast_vd_d(+0.96179669392608091449 ));
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vdouble2 s = ddadd_vd2_vd_vd2(vcast_vd_vi(e), ddmul_vd2_vd_vd(x, vcast_vd_d(2.885390081777926774)));
#else
  vdouble2 s = ddadd_vd2_vd_vd2(e, ddmul_vd2_vd_vd(x, vcast_vd_d(2.885390081777926774)));
#endif

  vdouble r = vmla_vd_vd_vd_vd(t, vmul_vd_vd_vd(x, x2), vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s)));

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  // +inf -> +inf; x < 0 or NaN -> NaN; +-0 -> -inf.
  r = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), r);
  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), r);
  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-SLEEF_INFINITY), r);
#else
  // AVX-512: the vfixup table encodes the same special-case mapping.
  r = vfixup_vd_vd_vd_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);
#endif

  return r;
}

// log1p(d): reduces d+1 to m with exponent e, then evaluates the log via an
// atanh-style series on x = m/(2+m) (coefficients ~2/3, 2/5, 2/7, ...) in
// double-double precision so small d loses no accuracy.
EXPORT CONST VECTOR_CC vdouble xlog1p(vdouble d) {
  vdouble2 x;
  vdouble t, m, x2;

  vdouble dp1 = vadd_vd_vd_vd(d, vcast_vd_d(1));

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  // Scale a subnormal d+1 up by 2^64 so the ilogb-based reduction is exact;
  // the exponent e is corrected by 64 afterwards.
  vopmask o = vlt_vo_vd_vd(dp1, vcast_vd_d(DBL_MIN));
  dp1 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(dp1, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), dp1);
  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(dp1, vcast_vd_d(1.0/0.75)));
  t = vldexp3_vd_vd_vi(vcast_vd_d(1), vneg_vi_vi(e));
  // m = d*2^-e + (2^-e - 1) == (d+1)*2^-e - 1, computed without forming d+1.
  m = vmla_vd_vd_vd_vd(d, t, vsub_vd_vd_vd(t, vcast_vd_d(1)));
  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);
  // s = e * ln(2), held as a double-double.
  vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e));
#else
  // AVX-512 path: vgetexp yields the exponent directly as a double.
  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(dp1, vcast_vd_d(1.0/0.75)));
  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);
  t = vldexp3_vd_vd_vi(vcast_vd_d(1), vneg_vi_vi(vrint_vi_vd(e)));
  m = vmla_vd_vd_vd_vd(d, t, vsub_vd_vd_vd(t, vcast_vd_d(1)));
  vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), e);
#endif

  // x = m/(2+m); log(1+m) = 2*atanh(x).
  x = dddiv_vd2_vd2_vd2(vcast_vd2_vd_vd(m, vcast_vd_d(0)), ddadd_vd2_vd_vd(vcast_vd_d(2), m));
  x2 = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));

  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4);
  t = POLY7(x2, x4, x8,
            0.1532076988502701353e+0,
            0.1525629051003428716e+0,
            0.1818605932937785996e+0,
            0.2222214519839380009e+0,
            0.2857142932794299317e+0,
            0.3999999999635251990e+0,
            0.6666666666667333541e+0);

  s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2)));
  s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)), t));

  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s));

  // Special cases: overflow, d < -1 or NaN, d == -1 (log of 0), and -0 input.
  r = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(1e+307)), vcast_vd_d(SLEEF_INFINITY), r);
  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(-1)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), r);
  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(-1)), vcast_vd_d(-SLEEF_INFINITY), r);
  r = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), r);

  return r;
}

// static INLINE CONST VECTOR_CC vint2 vcast_vi2_i_i(int i0, int i1) { return vcast_vi2_vm(vcast_vm_i_i(i0, i1)); }

// fabs(x): clear the sign bit.
EXPORT CONST VECTOR_CC vdouble xfabs(vdouble x) { return vabs_vd_vd(x); }

// copysign(x, y): magnitude of x with the sign of y.
EXPORT CONST VECTOR_CC vdouble xcopysign(vdouble x, vdouble y) { return vcopysign_vd_vd_vd(x, y); }

// fmax(x, y): a NaN y yields x (C99 fmax semantics).
EXPORT CONST VECTOR_CC vdouble xfmax(vdouble x, vdouble y) {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(ENABLE_VECEXT) && !defined(ENABLE_PUREC)
  return vsel_vd_vo_vd_vd(visnan_vo_vd(y), x, vmax_vd_vd_vd(x, y));
#else
  return vsel_vd_vo_vd_vd(visnan_vo_vd(y), x, vsel_vd_vo_vd_vd(vgt_vo_vd_vd(x, y), x, y));
#endif
}

// fmin(x, y): a NaN y yields x (C99 fmin semantics).
EXPORT CONST VECTOR_CC vdouble xfmin(vdouble x, vdouble y) {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(ENABLE_VECEXT) && !defined(ENABLE_PUREC)
  return vsel_vd_vo_vd_vd(visnan_vo_vd(y), x, vmin_vd_vd_vd(x, y));
#else
  return vsel_vd_vo_vd_vd(visnan_vo_vd(y), x, vsel_vd_vo_vd_vd(vgt_vo_vd_vd(y, x), x, y));
#endif
}

// fdim(x, y): positive difference, 0 when x <= y.
EXPORT CONST VECTOR_CC vdouble xfdim(vdouble x, vdouble y) {
  vdouble ret = vsub_vd_vd_vd(x, y);
  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(ret, vcast_vd_d(0)), veq_vo_vd_vd(x, y)), vcast_vd_d(0), ret);
  return ret;
}

// trunc(x): round toward zero.  Without native rounding the fractional part
// is peeled off in two 2^31-sized chunks so it fits 32-bit integer conversion.
EXPORT CONST VECTOR_CC vdouble xtrunc(vdouble x) {
#ifdef FULL_FP_ROUNDING
  return vtruncate_vd_vd(x);
#else
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  // |x| >= 2^52 has no fractional bits; pass x (and infinities) through.
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
#endif
}

// floor(x): same chunked fraction extraction as xtrunc; a negative fractional
// part is wrapped into [0, 1) so the subtraction lands on the floor.
EXPORT CONST VECTOR_CC vdouble xfloor(vdouble x) {
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  fr = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr);
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
}

// ceil(x): mirror of xfloor -- a positive fractional part is wrapped so the
// subtraction lands on the ceiling.
EXPORT CONST VECTOR_CC vdouble xceil(vdouble x) {
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  fr = vsel_vd_vo_vd_vd(vle_vo_vd_vd(fr, vcast_vd_d(0)), fr, vsub_vd_vd_vd(fr, vcast_vd_d(1.0)));
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
}

// round(d): round half away from zero (C99 round), built from trunc of d+0.5
// with corrections for exact integers and for the largest double below 0.5.
EXPORT CONST VECTOR_CC vdouble xround(vdouble d) {
  vdouble x = vadd_vd_vd_vd(d, vcast_vd_d(0.5));
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr
   // (continuation of xround, split across the chunk boundary)
= vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  x = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vle_vo_vd_vd(x, vcast_vd_d(0)), veq_vo_vd_vd(fr, vcast_vd_d(0))), vsub_vd_vd_vd(x, vcast_vd_d(1.0)), x);
  fr = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr);
  // 0.49999999999999994449 is the largest double < 0.5; d + 0.5 rounds it up
  // to 1, so it must be forced back to 0 explicitly.
  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0.49999999999999994449)), vcast_vd_d(0), x);
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(d), vge_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52))), d, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), d));
}

// rint(d): round to nearest via the classic add-then-subtract of 2^52 carrying
// d's sign (a no-op select keeps d when |d| already exceeds 2^52).
EXPORT CONST VECTOR_CC vdouble xrint(vdouble d) {
#ifdef FULL_FP_ROUNDING
  return vrint_vd_vd(d);
#else
  vdouble c = vmulsign_vd_vd_vd(vcast_vd_d(INT64_C(1) << 52), d);
  return vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52)), d, vorsign_vd_vd_vd(vsub_vd_vd_vd(vadd_vd_vd_vd(d, c), c), d));
#endif
}

// nextafter(x, y): step the 64-bit pattern of x by one ulp toward y.  All
// integer work is done on 32-bit lanes (vint2); carries/borrows between the
// low and high halves of each double are propagated manually via vrev21.
EXPORT CONST VECTOR_CC vdouble xnextafter(vdouble x, vdouble y) {
  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0)), vmulsign_vd_vd_vd(vcast_vd_d(0), y), x);
  vint2 t, xi2 = vreinterpret_vi2_vd(x);
  // c: true when stepping means decreasing the sign-magnitude pattern.
  vopmask c = vxor_vo_vo_vo(vsignbit_vo_vd(x), vge_vo_vd_vd(y, x));

  // Conditionally map the pattern (flip non-sign bits, add one, manual carry)
  // so that a single integer decrement below steps in the right direction.
  t = vadd_vi2_vi2_vi2(vxor_vi2_vi2_vi2(xi2, vcast_vi2_i_i(0x7fffffff, 0xffffffff)), vcast_vi2_i_i(0, 1));
  t = vadd_vi2_vi2_vi2(t, vrev21_vi2_vi2(vand_vi2_vi2_vi2(vcast_vi2_i_i(0, 1), veq_vi2_vi2_vi2(t, vcast_vi2_i_i(-1, 0)))));
  xi2 = vreinterpret_vi2_vd(vsel_vd_vo_vd_vd(c, vreinterpret_vd_vi2(t), vreinterpret_vd_vi2(xi2)));

  // Subtract one ulp when x != y, with manual borrow propagation.
  xi2 = vsub_vi2_vi2_vi2(xi2, vcast_vi2_vm(vand_vm_vo64_vm(vneq_vo_vd_vd(x, y), vcast_vm_i_i(0, 1))));
  xi2 = vreinterpret_vi2_vd(vsel_vd_vo_vd_vd(vneq_vo_vd_vd(x, y), vreinterpret_vd_vi2(vadd_vi2_vi2_vi2(xi2, vrev21_vi2_vi2(vand_vi2_vi2_vi2(vcast_vi2_i_i(0, -1), veq_vi2_vi2_vi2(xi2, vcast_vi2_i_i(0, -1)))))), vreinterpret_vd_vi2(xi2)));

  // Undo the conditional mapping.
  t = vadd_vi2_vi2_vi2(vxor_vi2_vi2_vi2(xi2, vcast_vi2_i_i(0x7fffffff, 0xffffffff)), vcast_vi2_i_i(0, 1));
  t = vadd_vi2_vi2_vi2(t, vrev21_vi2_vi2(vand_vi2_vi2_vi2(vcast_vi2_i_i(0, 1), veq_vi2_vi2_vi2(t, vcast_vi2_i_i(-1, 0)))));
  xi2 = vreinterpret_vi2_vd(vsel_vd_vo_vd_vd(c, vreinterpret_vd_vi2(t), vreinterpret_vd_vi2(xi2)));

  vdouble ret = vreinterpret_vd_vi2(xi2);
  // Stepping across zero must produce a zero carrying x's sign; x==y==0
  // returns y so the result keeps y's signed zero; NaN operands give NaN.
  ret = vsel_vd_vo_vd_vd(vand_vo_vo_vo(veq_vo_vd_vd(ret, vcast_vd_d(0)), vneq_vo_vd_vd(x, vcast_vd_d(0))), vmulsign_vd_vd_vd(vcast_vd_d(0), x), ret);
  ret = vsel_vd_vo_vd_vd(vand_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(0)), veq_vo_vd_vd(y, vcast_vd_d(0))), y, ret);
  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vcast_vd_d(SLEEF_NAN), ret);
  return ret;
}

// frexp fraction part: x with its exponent field overwritten by 0x3fe, i.e.
// scaled into [0.5, 1); subnormals are first normalized by multiplying 2^63.
EXPORT CONST VECTOR_CC vdouble xfrfrexp(vdouble x) {
  x = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(DBL_MIN)), vmul_vd_vd_vd(x, vcast_vd_d(UINT64_C(1) << 63)), x);

  vmask xm = vreinterpret_vm_vd(x);
  xm = vand_vm_vm_vm(xm, vcast_vm_i_i(~0x7ff00000, ~0));
  xm = vor_vm_vm_vm (xm, vcast_vm_i_i( 0x3fe00000, 0));

  vdouble ret = vreinterpret_vd_vm(xm);
  ret = vsel_vd_vo_vd_vd(visinf_vo_vd(x), vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), x), ret);
  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0)), x, ret);
  return ret;
}

// frexp exponent part: unbiased exponent field (shift 20, mask 0x7ff, minus
// 0x3fe); zero, NaN and infinity all report 0.
EXPORT CONST VECTOR_CC vint xexpfrexp(vdouble x) {
  x = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(DBL_MIN)), vmul_vd_vd_vd(x, vcast_vd_d(UINT64_C(1) << 63)), x);

  vint ret = vcastu_vi_vi2(vreinterpret_vi2_vd(x));
  ret = vsub_vi_vi_vi(vand_vi_vi_vi(vsrl_vi_vi_i(ret, 20), vcast_vi_i(0x7ff)), vcast_vi_i(0x3fe));

  ret = vsel_vi_vo_vi_vi(vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(0)), visnan_vo_vd(x)), visinf_vo_vd(x)), vcast_vi_i(0), ret);

  return ret;
}

// fma(x, y, z) emulation for targets without hardware FMA: operands in the
// extreme ranges are rescaled by powers of two, x*y+z is evaluated in
// double-double precision, and q undoes the scaling at the end.
EXPORT CONST VECTOR_CC vdouble xfma(vdouble x, vdouble y, vdouble z) {
#ifdef ENABLE_FMA_DP
  return vfma_vd_vd_vd_vd(x, y, z);
#else
  vdouble h2 = vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z), q = vcast_vd_d(1);
  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(h2), vcast_vd_d(1e-300));
  {
    // Tiny result: scale x and y up by c1 = 2^108 each (z by c2 = 2^216).
    const double c0 = UINT64_C(1) << 54, c1 = c0 * c0, c2 = c1 * c1;
    x = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(x, vcast_vd_d(c1)), x);
    y = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(y, vcast_vd_d(c1)), y);
    // (continuation of xfma's tiny-result scaling block)
    z = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(z, vcast_vd_d(c2)), z);
    q = vsel_vd_vo_vd_vd(o, vcast_vd_d(1.0 / c2), q);
  }
  o = vgt_vo_vd_vd(vabs_vd_vd(h2), vcast_vd_d(1e+300));
  {
    // Huge result: scale the operands down by the same factors.
    const double c0 = UINT64_C(1) << 54, c1 = c0 * c0, c2 = c1 * c1;
    x = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(x, vcast_vd_d(1.0 / c1)), x);
    y = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(y, vcast_vd_d(1.0 / c1)), y);
    z = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(z, vcast_vd_d(1.0 / c2)), z);
    q = vsel_vd_vo_vd_vd(o, vcast_vd_d(c2), q);
  }
  vdouble2 d = ddmul_vd2_vd_vd(x, y);
  d = ddadd2_vd2_vd2_vd(d, z);
  vdouble ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(0)), veq_vo_vd_vd(y, vcast_vd_d(0))), z, vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)));
  // When z is infinite while x and y are finite, the result is z.
  o = visinf_vo_vd(z);
  o = vandnot_vo_vo_vo(visinf_vo_vd(x), o);
  o = vandnot_vo_vo_vo(visnan_vo_vd(x), o);
  o = vandnot_vo_vo_vo(visinf_vo_vd(y), o);
  o = vandnot_vo_vo_vo(visnan_vo_vd(y), o);
  h2 = vsel_vd_vo_vd_vd(o, z, h2);

  // Infinite/NaN plain results win over the scaled double-double value.
  o = vor_vo_vo_vo(visinf_vo_vd(h2), visnan_vo_vd(h2));

  return vsel_vd_vo_vd_vd(o, h2, vmul_vd_vd_vd(ret, q));
#endif
}

// sqrt with <= 0.5 ulp error.
SQRTU05_FUNCATR VECTOR_CC vdouble xsqrt_u05(vdouble d) {
#if defined(ENABLE_FMA_DP)
  // FMA path: rsqrt seed from the 0x5fe6ec85e7de30da bit trick, refined with
  // fused Newton-Raphson steps on the (sqrt, half-rsqrt) pair (x, w).
  vdouble q, w, x, y, z;

  d = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), d);

  // Very small inputs are scaled up by ~2^256; q (= ~2^-128) undoes this.
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(8.636168555094445E-78));
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(1.157920892373162E77)), d);
  q = vsel_vd_vo_vd_vd(o, vcast_vd_d(2.9387358770557188E-39), vcast_vd_d(1));

  y = vreinterpret_vd_vi2(vsub_vi2_vi2_vi2(vcast_vi2_i_i(0x5fe6ec85, 0xe7de30da), vsrl_vi2_vi2_i(vreinterpret_vi2_vd(d), 1)));

  x = vmul_vd_vd_vd(d, y);
  w = vmul_vd_vd_vd(vcast_vd_d(0.5), y);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x);
  w = vfma_vd_vd_vd_vd(w, y, w);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x);
  w = vfma_vd_vd_vd_vd(w, y, w);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x);
  w = vfma_vd_vd_vd_vd(w, y, w);

  // Final correction step squeezing out the last fraction of an ulp.
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(1.5));
  w = vadd_vd_vd_vd(w, w);
  w = vmul_vd_vd_vd(w, y);
  x = vmul_vd_vd_vd(w, d);
  y = vfmapn_vd_vd_vd_vd(w, d, x);
  z = vfmanp_vd_vd_vd_vd(w, x, vcast_vd_d(1));

  z = vfmanp_vd_vd_vd_vd(w, y, z);
  w = vmul_vd_vd_vd(vcast_vd_d(0.5), x);
  w = vfma_vd_vd_vd_vd(w, z, y);
  w = vadd_vd_vd_vd(w, x);

  w = vmul_vd_vd_vd(w, q);

  // 0 and +inf map to themselves; negative input stays NaN.
  w = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(d, vcast_vd_d(0)), veq_vo_vd_vd(d, vcast_vd_d(SLEEF_INFINITY))), d, w);
  w = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), w);

  return w;
#else
  // Non-FMA path: three plain Newton iterations x *= (1.5 - 0.5*d*x*x) on a
  // rsqrt seed, then one double-double correction sqrt(d) ~= (d + x^2)/(2x).
  vdouble q;
  vopmask o;

  d = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), d);

  // Range reduction at both extremes; q carries the compensating factor
  // (with the final 0.5 from the correction formula folded in).
  o = vlt_vo_vd_vd(d, vcast_vd_d(8.636168555094445E-78));
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(1.157920892373162E77)), d);
  q = vsel_vd_vo_vd_vd(o, vcast_vd_d(2.9387358770557188E-39*0.5), vcast_vd_d(0.5));

  o = vgt_vo_vd_vd(d, vcast_vd_d(1.3407807929942597e+154));
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(7.4583407312002070e-155)), d);
  q = vsel_vd_vo_vd_vd(o, vcast_vd_d(1.1579208923731620e+77*0.5), q);

  // The 1e-320 bias keeps the seed finite for d == 0.
  vdouble x = vreinterpret_vd_vi2(vsub_vi2_vi2_vi2(vcast_vi2_i_i(0x5fe6ec86, 0), vsrl_vi2_vi2_i(vreinterpret_vi2_vd(vadd_vd_vd_vd(d, vcast_vd_d(1e-320))), 1)));

  x = vmul_vd_vd_vd(x, vsub_vd_vd_vd(vcast_vd_d(1.5), vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(0.5), d), x), x)));
  x = vmul_vd_vd_vd(x, vsub_vd_vd_vd(vcast_vd_d(1.5), vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(0.5), d), x), x)));
  x = vmul_vd_vd_vd(x, vsub_vd_vd_vd(vcast_vd_d(1.5), vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(0.5), d), x), x)));
  x = vmul_vd_vd_vd(x, d);

  vdouble2 d2 = ddmul_vd2_vd2_vd2(ddadd2_vd2_vd_vd2(d, ddmul_vd2_vd_vd(x, x)), ddrec_vd2_vd(x));

  x = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d2), vd2gety_vd_vd2(d2)), q);

  x = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), x);
  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, x);

  return x;
#endif
}

// sqrt dispatch: native instruction when available, else the u05 kernel.
EXPORT CONST VECTOR_CC vdouble xsqrt(vdouble d) {
#if defined(ACCURATE_SQRT)
  return vsqrt_vd_vd(d);
#else
  // fall back to approximation if ACCURATE_SQRT is undefined
  return xsqrt_u05(d);
#endif
}

// sqrt with relaxed (3.5 ulp) contract -- currently just the u05 kernel.
EXPORT CONST VECTOR_CC vdouble xsqrt_u35(vdouble d) { return xsqrt_u05(d); }

// hypot(x, y) with <= 0.5 ulp: max * sqrt(1 + (min/max)^2) evaluated in
// double-double precision; operands are rescaled when max is subnormal.
EXPORT CONST VECTOR_CC vdouble xhypot_u05(vdouble x, vdouble y) {
  x = vabs_vd_vd(x);
  y = vabs_vd_vd(y);
  vdouble min = vmin_vd_vd_vd(x, y), n = min;
  vdouble max = vmax_vd_vd_vd(x, y), d = max;

  vopmask o = vlt_vo_vd_vd(max, vcast_vd_d(DBL_MIN));
  n = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(n, vcast_vd_d(UINT64_C(1) << 54)), n);
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(UINT64_C(1) << 54)), d);

  vdouble2 t = dddiv_vd2_vd2_vd2(vcast_vd2_vd_vd(n, vcast_vd_d(0)), vcast_vd2_vd_vd(d, vcast_vd_d(0)));
  t = ddmul_vd2_vd2_vd(ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddsqu_vd2_vd2(t), vcast_vd_d(1))), max);
  vdouble ret = vadd_vd_vd_vd(vd2getx_vd_vd2(t), vd2gety_vd_vd2(t));
  // A NaN from the double-double pipeline is treated as overflow; the exact
  // C99 hypot rules (zero operand, NaN operand, infinite operand) follow.
  ret = vsel_vd_vo_vd_vd(visnan_vo_vd(ret), vcast_vd_d(SLEEF_INFINITY), ret);
  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(min, vcast_vd_d(0)), max, ret);
  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vcast_vd_d(SLEEF_NAN), ret);
  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(SLEEF_INFINITY)), veq_vo_vd_vd(y, vcast_vd_d(SLEEF_INFINITY))), vcast_vd_d(SLEEF_INFINITY), ret);

  return ret;
}

// hypot with relaxed accuracy: single-precision-style max*sqrt(1+t^2).
EXPORT CONST VECTOR_CC vdouble xhypot_u35(vdouble x, vdouble y) {
  x = vabs_vd_vd(x);
  y = vabs_vd_vd(y);
  vdouble min = vmin_vd_vd_vd(x, y);
  vdouble max = vmax_vd_vd_vd(x, y);

  vdouble t = vdiv_vd_vd_vd(min, max);
  vdouble ret = vmul_vd_vd_vd(max, vsqrt_vd_vd(vmla_vd_vd_vd_vd(t, t, vcast_vd_d(1))));
  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(min, vcast_vd_d(0)), max, ret);
  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vcast_vd_d(SLEEF_NAN), ret);
  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(SLEEF_INFINITY)), veq_vo_vd_vd(y, vcast_vd_d(SLEEF_INFINITY))), vcast_vd_d(SLEEF_INFINITY), ret);

  return ret;
}

static INLINE CONST
VECTOR_CC vdouble vtoward0(vdouble x) { // returns nextafter(x, 0)
  // Subtracting 1 from the 64-bit pattern steps toward zero for any nonzero
  // finite magnitude; zero is passed through unchanged.
  vdouble t = vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(x), vcast_vm_i_i(-1, -1)));
  return vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0)), vcast_vd_d(0), t);
}

static INLINE CONST VECTOR_CC vdouble vptrunc(vdouble x) { // round to integer toward 0, positive argument only
#ifdef FULL_FP_ROUNDING
  return vtruncate_vd_vd(x);
#else
  // Same two-chunk fraction extraction as xtrunc, fused into one vmla.
  vdouble fr = vmla_vd_vd_vd_vd(vcast_vd_d(-(double)(INT64_C(1) << 31)), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31))))), x);
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  return vsel_vd_vo_vd_vd(vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52)), x, vsub_vd_vd_vd(x, fr));
#endif
}

/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
// fmod(x, y): long-division-style remainder.  Each pass removes up to ~52
// bits of quotient from the double-double residual r; 21 passes cover the
// whole double exponent range.
EXPORT CONST VECTOR_CC vdouble xfmod(vdouble x, vdouble y) {
  vdouble n = vabs_vd_vd(x), d = vabs_vd_vd(y), s = vcast_vd_d(1), q;
  // Rescale when the divisor is subnormal; s undoes the scaling at the end.
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));
  n = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(n, vcast_vd_d(UINT64_C(1) << 54)), n);
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(UINT64_C(1) << 54)), d);
  s = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(s , vcast_vd_d(1.0 / (UINT64_C(1) << 54))), s);
  vdouble2 r = vcast_vd2_vd_vd(n, vcast_vd_d(0));
  // Reciprocal rounded toward zero so the trial quotient never overshoots.
  vdouble rd = vtoward0(vrec_vd_vd(d));

  for(int i=0;i<21;i++) { // ceil(log2(DBL_MAX) / 52)
    q = vptrunc(vmul_vd_vd_vd(vtoward0(vd2getx_vd_vd2(r)), rd));
#ifndef ENABLE_FMA_DP
    // Without FMA, clear the quotient's lowest mantissa bit so q*d is exact.
    q = vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(q), vcast_vm_i_i(0xffffffff, 0xfffffffe)));
#endif
    // Near the end (r < 3d) force the exact small quotient 2 or 1.
    q = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vgt_vo_vd_vd(vmul_vd_vd_vd(vcast_vd_d(3), d), vd2getx_vd_vd2(r)), vge_vo_vd_vd(vd2getx_vd_vd2(r), d)), vcast_vd_d(2), q);
    q = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vgt_vo_vd_vd(vadd_vd_vd_vd(d, d), vd2getx_vd_vd2(r)), vge_vo_vd_vd(vd2getx_vd_vd2(r), d)), vcast_vd_d(1), q);
    r = ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd2(r, ddmul_vd2_vd_vd(q, vneg_vd_vd(d))));
    if (vtestallones_i_vo64(vlt_vo_vd_vd(vd2getx_vd_vd2(r), d))) break;
  }

  vdouble ret = vmul_vd_vd_vd(vd2getx_vd_vd2(r), s);
  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(r), vd2gety_vd_vd2(r)), d), vcast_vd_d(0), ret);

  // fmod's result takes the sign of x.
  ret = vmulsign_vd_vd_vd(ret, x);

  ret = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(n, d), x, ret);
  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), ret);

  return ret;
}

// Round-to-nearest helper for xremainder (same 2^52 trick as xrint).
static INLINE VECTOR_CC vdouble vrintk2_vd_vd(vdouble d) {
#ifdef FULL_FP_ROUNDING
  return vrint_vd_vd(d);
#else
  vdouble c = vmulsign_vd_vd_vd(vcast_vd_d(INT64_C(1) << 52), d);
  return vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52)), d, vorsign_vd_vd_vd(vsub_vd_vd_vd(vadd_vd_vd_vd(d, c), c), d));
#endif
}

// remainder(x, y): IEEE 754 remainder -- quotient rounded to nearest, result
// in [-|y|/2, +|y|/2]; qisodd tracks quotient parity for the halfway rule.
EXPORT CONST VECTOR_CC vdouble xremainder(vdouble x, vdouble y) {
  vdouble n = vabs_vd_vd(x), d = vabs_vd_vd(y), s = vcast_vd_d(1), q;
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN*2));
  n = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(n, vcast_vd_d(UINT64_C(1) << 54)), n);
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(UINT64_C(1) << 54)), d);
  s = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(s , vcast_vd_d(1.0 / (UINT64_C(1) << 54))), s);
  vdouble rd = vrec_vd_vd(d);
  vdouble2 r = vcast_vd2_vd_vd(n, vcast_vd_d(0));
  vopmask qisodd = vneq_vo_vd_vd(vcast_vd_d(0), vcast_vd_d(0)); // all-false mask

  for(int i=0;i<21;i++) { // ceil(log2(DBL_MAX) / 52)
    q = vrintk2_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(r), rd));
#ifndef ENABLE_FMA_DP
    // Without FMA, clear the quotient's lowest mantissa bit so q*d is exact.
    q = vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(q), vcast_vm_i_i(0xffffffff, 0xfffffffe)));
#endif
    // |r| < 1.5d: quotient magnitude is exactly 1 (with r's sign); below
    // 0.5d -- or exactly 0.5d with an even quotient so far -- it is 0.
    q = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(vabs_vd_vd(vd2getx_vd_vd2(r)), vmul_vd_vd_vd(d, vcast_vd_d(1.5))), vmulsign_vd_vd_vd(vcast_vd_d(1.0), vd2getx_vd_vd2(r)), q);
    q = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(vabs_vd_vd(vd2getx_vd_vd2(r)), vmul_vd_vd_vd(d, vcast_vd_d(0.5))), vandnot_vo_vo_vo(qisodd, veq_vo_vd_vd(vabs_vd_vd(vd2getx_vd_vd2(r)), vmul_vd_vd_vd(d, vcast_vd_d(0.5))))), vcast_vd_d(0.0), q);
    if (vtestallones_i_vo64(veq_vo_vd_vd(q, vcast_vd_d(0)))) break;
    // Guard against q*(-d) overflowing to infinity.
    q = vsel_vd_vo_vd_vd(visinf_vo_vd(vmul_vd_vd_vd(q, vneg_vd_vd(d))), vadd_vd_vd_vd(q, vmulsign_vd_vd_vd(vcast_vd_d(-1), vd2getx_vd_vd2(r))), q);
    qisodd = vxor_vo_vo_vo(qisodd, visodd_vo_vd(q));
    r = ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd2(r, ddmul_vd2_vd_vd(q, vneg_vd_vd(d))));
  }

  vdouble ret = vmul_vd_vd_vd(vd2getx_vd_vd2(r), s);
  ret = vmulsign_vd_vd_vd(ret, x);
  // remainder(x, inf) = x for finite x, NaN for infinite x; y == 0 -> NaN.
  ret = vsel_vd_vo_vd_vd(visinf_vo_vd(y), vsel_vd_vo_vd_vd(visinf_vo_vd(x), vcast_vd_d(SLEEF_NAN), x), ret);
  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), ret);
  return ret;
}

#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA))
// Pair of double-doubles returned by gammak.  Guarded out for SVE builds --
// presumably provided elsewhere for those targets (confirm in the full file).
typedef struct {
  vdouble2 a, b;
} dd2;

static dd2 dd2setab_dd2_vd2_vd2(vdouble2 a, vdouble2 b) {
  dd2 r = { a, b };
  return r;
}
static vdouble2 dd2geta_vd2_dd2(dd2 d) { return d.a; }
static vdouble2 dd2getb_vd2_dd2(dd2 d) { return d.b; }
#endif

/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
// Shared gamma kernel: returns {a, b} such that the callers below form
// tgamma(x) = exp(a) * b and lgamma(x) = a + log|b| (see xtgamma_u1 /
// xlgamma_u1).  Handles reflection (x < 0.5), tiny arguments, and three
// polynomial regimes selected by o0 / o2.
static CONST dd2 gammak(vdouble a) {
  vdouble2 clc = vcast_vd2_d_d(0, 0), clln = vcast_vd2_d_d(1, 0), clld = vcast_vd2_d_d(1, 0);
  vdouble2 v = vcast_vd2_d_d(1, 0), x, y, z; // NOTE(review): v looks unused here
  vdouble t, u;

  vopmask otiny = vlt_vo_vd_vd(vabs_vd_vd(a), vcast_vd_d(1e-306)), oref = vlt_vo_vd_vd(a, vcast_vd_d(0.5));

  // Reflection: for a < 0.5 work with 1 - a instead.
  x = vsel_vd2_vo_vd2_vd2(otiny, vcast_vd2_d_d(0, 0),
                          vsel_vd2_vo_vd2_vd2(oref, ddadd2_vd2_vd_vd(vcast_vd_d(1), vneg_vd_vd(a)),
                                              vcast_vd2_vd_vd(a, vcast_vd_d(0))));

  vopmask o0 = vand_vo_vo_vo(vle_vo_vd_vd(vcast_vd_d(0.5), vd2getx_vd_vd2(x)), vle_vo_vd_vd(vd2getx_vd_vd2(x), vcast_vd_d(1.1)));
  vopmask o2 = vle_vo_vd_vd(vcast_vd_d(2.3), vd2getx_vd_vd2(x));

  // y accumulates x*(x+1)*(x+2)*(x+3)*(x+4), used to shift moderate
  // arguments up by 5 (gamma recurrence) into the asymptotic regime.
  y = ddnormalize_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(1)), x));
  y = ddnormalize_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(2)), y));
  y = ddnormalize_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(3)), y));
  y = ddnormalize_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(4)), y));

  vopmask o = vand_vo_vo_vo(o2,
                            // (continuation of gammak's regime mask)
                            vle_vo_vd_vd(vd2getx_vd_vd2(x), vcast_vd_d(7)));
  clln = vsel_vd2_vo_vd2_vd2(o, y, clln);

  x = vsel_vd2_vo_vd2_vd2(o, ddadd2_vd2_vd2_vd(x, vcast_vd_d(5)), x);

  // t: 1/x for the asymptotic regime (o2), else a shifted x for the series.
  t = vsel_vd_vo_vd_vd(o2, vrec_vd_vd(vd2getx_vd_vd2(x)), vd2getx_vd_vd2(ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd(x, vsel_vd_vo_d_d(o0, -1, -2)))));

  // One Horner evaluation with per-lane coefficients for the three regimes
  // (asymptotic Stirling series / series near 1 / series near 2).
  u = vsel_vd_vo_vo_d_d_d(o2, o0, -156.801412704022726379848862, +0.2947916772827614196e+2, +0.7074816000864609279e-7);
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +1.120804464289911606838558160000, +0.1281459691827820109e+3, +0.4009244333008730443e-6));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +13.39798545514258921833306020000, +0.2617544025784515043e+3, +0.1040114641628246946e-5));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.116546276599463200848033357000, +0.3287022855685790432e+3, +0.1508349150733329167e-5));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -1.391801093265337481495562410000, +0.2818145867730348186e+3, +0.1288143074933901020e-5));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.015056113040026424412918973400, +0.1728670414673559605e+3, +0.4744167749884993937e-6));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.179540117061234856098844714000, +0.7748735764030416817e+2, -0.6554816306542489902e-7));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.002481743600264997730942489280, +0.2512856643080930752e+2, -0.3189252471452599844e-6));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.029527880945699120504851034100, +0.5766792106140076868e+1, +0.1358883821470355377e-6));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.000540164767892604515196325186, +0.7270275473996180571e+0, -0.4343931277157336040e-6));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.006403362833808069794787256200, +0.8396709124579147809e-1, +0.9724785897406779555e-6));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.000162516262783915816896611252, -0.8211558669746804595e-1, -0.2036886057225966011e-5));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.001914438498565477526465972390, +0.6828831828341884458e-1, +0.4373363141819725815e-5));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +7.20489541602001055898311517e-05, -0.7712481339961671511e-1, -0.9439951268304008677e-5));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.000839498720672087279971000786, +0.8337492023017314957e-1, +0.2050727030376389804e-4));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -5.17179090826059219329394422e-05, -0.9094964931456242518e-1, -0.4492620183431184018e-4));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.000592166437353693882857342347, +0.1000996313575929358e+0, +0.9945751236071875931e-4));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +6.97281375836585777403743539e-05, -0.1113342861544207724e+0, -0.2231547599034983196e-3));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.000784039221720066627493314301, +0.1255096673213020875e+0, +0.5096695247101967622e-3));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.000229472093621399176949318732, -0.1440498967843054368e+0, -0.1192753911667886971e-2));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.002681327160493827160473958490, +0.1695571770041949811e+0, +0.2890510330742210310e-2));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.003472222222222222222175164840, -0.2073855510284092762e+0, -0.7385551028674461858e-2));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.083333333333333333335592087900, +0.2705808084277815939e+0, +0.2058080842778455335e-1));

  // Asymptotic (Stirling) branch: (x - 0.5)*log(x) - x + 0.5*log(2*pi) + ...
  y = ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(-0.5)), logk2(x));
  y = ddadd2_vd2_vd2_vd2(y, ddneg_vd2_vd2(x));
  y = ddadd2_vd2_vd2_vd2(y, vcast_vd2_d_d(0.91893853320467278056, -3.8782941580672414498e-17)); // 0.5*log(2*M_PI)

  // Series branch around 1 / 2 (note -0.5772156... = -Euler-Mascheroni).
  z = ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd (u, t), vsel_vd_vo_d_d(o0, -0.4006856343865314862e+0, -0.6735230105319810201e-1));
  z = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(z, t), vsel_vd_vo_d_d(o0, +0.8224670334241132030e+0, +0.3224670334241132030e+0));
  z = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(z, t), vsel_vd_vo_d_d(o0, -0.5772156649015328655e+0, +0.4227843350984671345e+0));
  z = ddmul_vd2_vd2_vd(z, t);

  clc = vsel_vd2_vo_vd2_vd2(o2, y, z);

  clld = vsel_vd2_vo_vd2_vd2(o2, ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(u, t), vcast_vd_d(1)), clld);

  y = clln;

  // Apply the reflection formula (log(pi) - clc) and the tiny-argument
  // fallback (log(2^120) compensates the 2^120 scale applied to clld below).
  clc = vsel_vd2_vo_vd2_vd2(otiny, vcast_vd2_d_d(83.1776616671934334590333, 3.67103459631568507221878e-15), // log(2^120)
                            vsel_vd2_vo_vd2_vd2(oref, ddadd2_vd2_vd2_vd2(vcast_vd2_d_d(1.1447298858494001639, 1.026595116270782638e-17), ddneg_vd2_vd2(clc)), clc)); // log(M_PI)
  clln = vsel_vd2_vo_vd2_vd2(otiny, vcast_vd2_d_d(1, 0), vsel_vd2_vo_vd2_vd2(oref, clln, clld));

  // Reflection denominator needs sin(pi*a); reduce a modulo 2^28 first.
  if (!vtestallones_i_vo64(vnot_vo64_vo64(oref))) {
    t = vsub_vd_vd_vd(a, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 28), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(a, vcast_vd_d(1.0 / (INT64_C(1) << 28)))))));
    x = ddmul_vd2_vd2_vd2(clld, sinpik(t));
  }

  clld = vsel_vd2_vo_vd2_vd2(otiny, vcast_vd2_vd_vd(vmul_vd_vd_vd(a, vcast_vd_d((INT64_C(1) << 60)*(double)(INT64_C(1) << 60))), vcast_vd_d(0)),
                             vsel_vd2_vo_vd2_vd2(oref, x, y));

  return dd2setab_dd2_vd2_vd2(clc, dddiv_vd2_vd2_vd2(clln, clld));
}

// tgamma(a) ~1 ulp: gamma(a) = exp(clc) * (clln/clld) from gammak, plus the
// C99 special-case handling (negative integers, -inf, overflow sign).
EXPORT CONST VECTOR_CC vdouble xtgamma_u1(vdouble a) {
  dd2 d = gammak(a);
  vdouble2 y = ddmul_vd2_vd2_vd2(expk2(dd2geta_vd2_dd2(d)), dd2getb_vd2_dd2(d));
  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(y), vd2gety_vd_vd2(y));
  vopmask o;

  // NaN for -inf, negative integers, and negative values whose kernel
  // result came out NaN.
  o = vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vd_vd(a, vcast_vd_d(-SLEEF_INFINITY)),
                                vand_vo_vo_vo(vlt_vo_vd_vd(a, vcast_vd_d(0)), visint_vo_vd(a))),
                   vand_vo_vo_vo(vand_vo_vo_vo(visnumber_vo_vd(a), vlt_vo_vd_vd(a, vcast_vd_d(0))), visnan_vo_vd(r)));
  r = vsel_vd_vo_vd_vd(o, vcast_vd_d(SLEEF_NAN), r);

  // Signed infinity for +inf, overflow (a > 200), and the zero neighborhood.
  o = vand_vo_vo_vo(vand_vo_vo_vo(vor_vo_vo_vo(veq_vo_vd_vd(a, vcast_vd_d(SLEEF_INFINITY)), visnumber_vo_vd(a)),
                                  vge_vo_vd_vd(a, vcast_vd_d(-DBL_MIN))),
                    vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vd_vd(a, vcast_vd_d(0)),
                                              // (continuation of xtgamma_u1's overflow mask)
                                              vgt_vo_vd_vd(a, vcast_vd_d(200))), visnan_vo_vd(r)));
  r = vsel_vd_vo_vd_vd(o, vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), a), r);

  return r;
}

// lgamma(a) ~1 ulp: log|gamma(a)| = clc + log|clln/clld| from gammak;
// +inf for infinities, non-positive integers, and NaN kernel results.
EXPORT CONST VECTOR_CC vdouble xlgamma_u1(vdouble a) {
  dd2 d = gammak(a);
  vdouble2 y = ddadd2_vd2_vd2_vd2(dd2geta_vd2_dd2(d), logk2(ddabs_vd2_vd2(dd2getb_vd2_dd2(d))));
  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(y), vd2gety_vd_vd2(y));
  vopmask o;

  o = vor_vo_vo_vo(visinf_vo_vd(a),
                   vor_vo_vo_vo(vand_vo_vo_vo(vle_vo_vd_vd(a, vcast_vd_d(0)), visint_vo_vd(a)),
                                vand_vo_vo_vo(visnumber_vo_vd(a), visnan_vo_vd(r))));
  r = vsel_vd_vo_vd_vd(o, vcast_vd_d(SLEEF_INFINITY), r);

  return r;
}

/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
// erf(a) ~1 ulp.  Three regimes chosen by |a|: below 1 a polynomial in a*a
// (later multiplied by a); 1..3.7 and 3.7..6 use 1 - exp(poly(|a|)); beyond
// 6 erf saturates to +-1.  One Horner pass with per-lane coefficients.
EXPORT CONST VECTOR_CC vdouble xerf_u1(vdouble a) {
  vdouble s = a, t, u;
  vdouble2 d;

  a = vabs_vd_vd(a);
  vopmask o0 = vlt_vo_vd_vd(a, vcast_vd_d(1.0));
  vopmask o1 = vlt_vo_vd_vd(a, vcast_vd_d(3.7));
  vopmask o2 = vlt_vo_vd_vd(a, vcast_vd_d(6.0));
  u = vsel_vd_vo_vd_vd(o0, vmul_vd_vd_vd(a, a), a);

  t = vsel_vd_vo_vo_d_d_d(o0, o1, +0.6801072401395392157e-20, +0.2830954522087717660e-13, -0.5846750404269610493e-17);
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.2161766247570056391e-18, -0.1509491946179481940e-11, +0.6076691048812607898e-15));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.4695919173301598752e-17, +0.3827857177807173152e-10, -0.3007518609604893831e-13));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.9049140419888010819e-16, -0.6139733921558987241e-09, +0.9427906260824646063e-12));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1634018903557411517e-14, +0.6985387934608038824e-08, -0.2100110908269393629e-10));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.2783485786333455216e-13, -0.5988224513034371474e-07, +0.3534639523461223473e-09));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.4463221276786412722e-12, +0.4005716952355346640e-06, -0.4664967728285395926e-08));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.6711366622850138987e-11, -0.2132190104575784400e-05, +0.4943823283769000532e-07));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.9422759050232658346e-10, +0.9092461304042630325e-05, -0.4271203394761148254e-06));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.1229055530100228477e-08, -0.3079188080966205457e-04, +0.3034067677404915895e-05));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1480719281585085023e-07, +0.7971413443082370762e-04, -0.1776295289066871135e-04));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.1636584469123402714e-06, -0.1387853215225442864e-03, +0.8524547630559505050e-04));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1646211436588923363e-05, +0.6469678026257590965e-04, -0.3290582944961784398e-03));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.1492565035840624866e-04, +0.4996645280372945860e-03, +0.9696966068789101157e-03));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1205533298178966496e-03, -0.1622802482842520535e-02, -0.1812527628046986137e-02));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.8548327023450851166e-03, +0.1615320557049377171e-03, -0.4725409828123619017e-03));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.5223977625442188799e-02, +0.1915262325574875607e-01, +0.2090315427924229266e-01));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.2686617064513125569e-01, -0.1027818298486033455e+00, -0.1052041921842776645e+00));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1128379167095512753e+00, -0.6366172819842503827e+00, -0.6345351808766568347e+00));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.3761263890318375380e+00, -0.1128379590648910469e+01, -0.1129442929103524396e+01));

  d = ddmul_vd2_vd_vd(t, u);
  // Leading term added in double-double: 2/sqrt(pi) for the small regime.
  d = ddadd2_vd2_vd2_vd2(d, vcast_vd2_vd_vd(vsel_vd_vo_vo_d_d_d(o0, o1, 1.1283791670955125586, 3.4110644736196137587e-08, 0.00024963035690526438285),
                                            vsel_vd_vo_vo_d_d_d(o0, o1,
                                                                1.5335459613165822674e-17, -2.4875650708323294246e-24, -5.4362665034856259795e-21)));

  // Small regime: erf = d * a.  Large regimes: erf = 1 - exp(d).
  d = vsel_vd2_vo_vd2_vd2(o0, ddmul_vd2_vd2_vd(d, a), ddadd_vd2_vd_vd2(vcast_vd_d(1.0), ddneg_vd2_vd2(expk2(d))));
  u = vmulsign_vd_vd_vd(vsel_vd_vo_vd_vd(o2, vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(1)), s);
  u = vsel_vd_vo_vd_vd(visnan_vo_vd(a), vcast_vd_d(SLEEF_NAN), u);

  return u;
}

/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
// erfc(a) with <= 1.5 ulp.  Four regimes by |a| (<1, <2.2, <4.2, <27.3);
// the argument u is a*a, a, or 1/a depending on the regime, and the large
// regimes finish through expk2.  Beyond 27.3 erfc underflows to 0 (or 2
// for negative input).
EXPORT CONST VECTOR_CC vdouble xerfc_u15(vdouble a) {
  vdouble s = a, r = vcast_vd_d(0), t;
  vdouble2 u, d, x;
  a = vabs_vd_vd(a);
  vopmask o0 = vlt_vo_vd_vd(a, vcast_vd_d(1.0));
  vopmask o1 = vlt_vo_vd_vd(a, vcast_vd_d(2.2));
  vopmask o2 = vlt_vo_vd_vd(a, vcast_vd_d(4.2));
  vopmask o3 = vlt_vo_vd_vd(a, vcast_vd_d(27.3));
  u = vsel_vd2_vo_vd2_vd2(o0, ddmul_vd2_vd_vd(a, a), vsel_vd2_vo_vd2_vd2(o1, vcast_vd2_vd_vd(a, vcast_vd_d(0)), dddiv_vd2_vd2_vd2(vcast_vd2_d_d(1, 0), vcast_vd2_vd_vd(a, vcast_vd_d(0)))));

  t = vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.6801072401395386139e-20, +0.3438010341362585303e-12, -0.5757819536420710449e+2, +0.2334249729638701319e+5);
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.2161766247570055669e-18, -0.1237021188160598264e-10, +0.4669289654498104483e+3, -0.4695661044933107769e+5));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.4695919173301595670e-17, +0.2117985839877627852e-09, -0.1796329879461355858e+4, +0.3173403108748643353e+5));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.9049140419888007122e-16, -0.2290560929177369506e-08, +0.4355892193699575728e+4, +0.3242982786959573787e+4));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.1634018903557410728e-14, +0.1748931621698149538e-07, -0.7456258884965764992e+4, -0.2014717999760347811e+5));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.2783485786333451745e-13,
                                                                      // (continuation of xerfc_u15's Horner pass)
                                                                      -0.9956602606623249195e-07, +0.9553977358167021521e+4, +0.1554006970967118286e+5));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.4463221276786415752e-12, +0.4330010240640327080e-06, -0.9470019905444229153e+4, -0.6150874190563554293e+4));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.6711366622850136563e-11, -0.1435050600991763331e-05, +0.7387344321849855078e+4, +0.1240047765634815732e+4));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.9422759050232662223e-10, +0.3460139479650695662e-05, -0.4557713054166382790e+4, -0.8210325475752699731e+2));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.1229055530100229098e-08, -0.4988908180632898173e-05, +0.2207866967354055305e+4, +0.3242443880839930870e+2));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.1480719281585086512e-07, -0.1308775976326352012e-05, -0.8217975658621754746e+3, -0.2923418863833160586e+2));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.1636584469123399803e-06, +0.2825086540850310103e-04, +0.2268659483507917400e+3, +0.3457461732814383071e+0));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.1646211436588923575e-05, -0.6393913713069986071e-04, -0.4633361260318560682e+2, +0.5489730155952392998e+1));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.1492565035840623511e-04, -0.2566436514695078926e-04, +0.9557380123733945965e+1, +0.1559934132251294134e-2));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.1205533298178967851e-03, +0.5895792375659440364e-03, -0.2958429331939661289e+1, -0.1541741566831520638e+1));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.8548327023450850081e-03, -0.1695715579163588598e-02, +0.1670329508092765480e+0, +0.2823152230558364186e-5));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.5223977625442187932e-02, +0.2089116434918055149e-03, +0.6096615680115419211e+0, +0.6249999184195342838e+0));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.2686617064513125222e-01, +0.1912855949584917753e-01, +0.1059212443193543585e-2, +0.1741749416408701288e-8));

  // Three more double-double Horner steps with regime-dependent constants.
  d = ddmul_vd2_vd2_vd(u, t);
  d = ddadd2_vd2_vd2_vd2(d, vcast_vd2_vd_vd(vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, 0.11283791670955126141, -0.10277263343147646779, -0.50005180473999022439, -0.5000000000258444377),
                                            vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -4.0175691625932118483e-18, -6.2338714083404900225e-18, 2.6362140569041995803e-17, -4.0074044712386992281e-17)));
  d = ddmul_vd2_vd2_vd2(d, u);
  d = ddadd2_vd2_vd2_vd2(d, vcast_vd2_vd_vd(vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.37612638903183753802, -0.63661976742916359662, 1.601106273924963368e-06, 2.3761973137523364792e-13),
                                            vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, 1.3391897206042552387e-17, 7.6321019159085724662e-18, 1.1974001857764476775e-23, -1.1670076950531026582e-29)));
  d = ddmul_vd2_vd2_vd2(d, u);
  d = ddadd2_vd2_vd2_vd2(d, vcast_vd2_vd_vd(vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, 1.1283791670955125586, -1.1283791674717296161, -0.57236496645145429341, -0.57236494292470108114),
                                            vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, 1.5335459613165822674e-17, 8.0896847755965377194e-17, 3.0704553245872027258e-17, -2.3984352208056898003e-17)));

  // Regime assembly: |a| < 1 uses 1 - d*a; larger regimes go through expk2.
  x = ddmul_vd2_vd2_vd(vsel_vd2_vo_vd2_vd2(o1, d, vcast_vd2_vd_vd(vneg_vd_vd(a), vcast_vd_d(0))), a);
  x = vsel_vd2_vo_vd2_vd2(o1, x, ddadd2_vd2_vd2_vd2(x, d));

  x = vsel_vd2_vo_vd2_vd2(o0, ddsub_vd2_vd2_vd2(vcast_vd2_d_d(1, 0), x), expk2(x));
  x = vsel_vd2_vo_vd2_vd2(o1, x, ddmul_vd2_vd2_vd2(x, u));

  // |a| >= 27.3 underflows to 0; erfc(-a) = 2 - erfc(a); NaN propagates.
  r = vsel_vd_vo_vd_vd(o3, vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)), vcast_vd_d(0));
  r = vsel_vd_vo_vd_vd(vsignbit_vo_vd(s), vsub_vd_vd_vd(vcast_vd_d(2), r), r);
  r = vsel_vd_vo_vd_vd(visnan_vo_vd(s), vcast_vd_d(SLEEF_NAN), r);
  return r;
}
#endif // #if !defined(DETERMINISTIC)

#if !defined(DETERMINISTIC) && !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER)

// The normal and deterministic versions of implementations are common
// for functions like sincospi_u05.  Aliases are defined by the DALIAS_*
// macros for such functions.  The defined aliases (e.g. ysincospi_u05)
// are renamed (e.g. to Sleef_cinz_sincospid2_u05sse2) by rename*.h.

#ifdef ENABLE_ALIAS
// With compiler alias support, y-prefixed names are true symbol aliases.
#define DALIAS_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble) __attribute__((alias( stringify(x ## FUNC) )));
#define DALIAS_vd2_vd(FUNC) EXPORT CONST VECTOR_CC vdouble2 y ## FUNC(vdouble) __attribute__((alias( stringify(x ## FUNC) )));
#define DALIAS_vi_vd(FUNC) EXPORT CONST VECTOR_CC vint y ## FUNC(vdouble) __attribute__((alias( stringify(x ## FUNC) )));
#define DALIAS_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble, vdouble) __attribute__((alias( stringify(x ## FUNC) )));
#define DALIAS_vd_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble, vdouble, vdouble) __attribute__((alias( stringify(x ## FUNC) )));
#else
// Otherwise, they are thin forwarding wrappers.
#define DALIAS_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble d) { return x ## FUNC (d); }
#define DALIAS_vd2_vd(FUNC) EXPORT CONST VECTOR_CC vdouble2 y ## FUNC(vdouble d) { return x ## FUNC (d); }
#define DALIAS_vi_vd(FUNC) EXPORT CONST VECTOR_CC vint y ## FUNC(vdouble d) { return x ## FUNC (d); }
#define DALIAS_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble x, vdouble y) { return x ## FUNC (x, y); }
#define DALIAS_vd_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble x, vdouble y, vdouble z) { return x ## FUNC (x, y, z); }
#endif

/* DALIAS_vd2_vd(sincospi_u05) */
/* DALIAS_vd2_vd(sincospi_u35) */
/* DALIAS_vd2_vd(modf) */
/* DALIAS_vd_vd(log) */
/* DALIAS_vd_vd(log_u1) */
/* DALIAS_vd_vd_vd(pow) */
/* DALIAS_vd_vd(sinh) */
/* DALIAS_vd_vd(cosh) */
/* DALIAS_vd_vd(tanh) */
/* DALIAS_vd_vd(sinh_u35) */
/*
DALIAS_vd_vd(cosh_u35) */ /* DALIAS_vd_vd(tanh_u35) */ /* DALIAS_vd_vd(asinh) */ /* DALIAS_vd_vd(acosh) */ /* DALIAS_vd_vd(atanh) */ /* DALIAS_vd_vd(cbrt) */ /* DALIAS_vd_vd(cbrt_u1) */ /* DALIAS_vd_vd(expm1) */ /* DALIAS_vd_vd(log10) */ /* DALIAS_vd_vd(log2) */ /* DALIAS_vd_vd(log2_u35) */ /* DALIAS_vd_vd(log1p) */ /* DALIAS_vd_vd(fabs) */ /* DALIAS_vd_vd_vd(copysign) */ /* DALIAS_vd_vd_vd(fmax) */ /* DALIAS_vd_vd_vd(fmin) */ /* DALIAS_vd_vd_vd(fdim) */ /* DALIAS_vd_vd(trunc) */ /* DALIAS_vd_vd(floor) */ /* DALIAS_vd_vd(ceil) */ /* DALIAS_vd_vd(round) */ /* DALIAS_vd_vd(rint) */ /* DALIAS_vd_vd_vd(nextafter) */ /* DALIAS_vd_vd(frfrexp) */ /* DALIAS_vi_vd(expfrexp) */ /* DALIAS_vd_vd_vd_vd(fma) */ /* DALIAS_vd_vd(sqrt_u05) */ /* DALIAS_vd_vd(sqrt_u35) */ /* DALIAS_vd_vd_vd(hypot_u05) */ /* DALIAS_vd_vd_vd(hypot_u35) */ /* DALIAS_vd_vd_vd(fmod) */ /* DALIAS_vd_vd_vd(remainder) */ /* DALIAS_vd_vd(tgamma_u1) */ /* DALIAS_vd_vd(lgamma_u1) */ /* DALIAS_vd_vd(erf_u1) */ /* DALIAS_vd_vd(erfc_u15) */ #endif // #if !defined(DETERMINISTIC) && !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER) #if !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER)
/* Runtime introspection hooks: xgetInt reports ISA availability for query ids 1..10 via vavailability_i; xgetPtr(0) returns the ISA name string (ISANAME), anything else yields a null pointer. */
EXPORT CONST int xgetInt(int name) { if (1 <= name && name <= 10) return vavailability_i(name); return 0; } EXPORT CONST void *xgetPtr(int name) { if (name == 0) return ISANAME; return (void *)0; } #endif #if defined(ALIAS_NO_EXT_SUFFIX) && !defined(DETERMINISTIC) #include ALIAS_NO_EXT_SUFFIX #endif
/* Ad-hoc manual test driver; only built when ENABLE_MAIN is defined (see the gcc command in the comment below). NOTE(review): the three parameterless #include directives inside this section lost their <...> targets during extraction (presumably stdio.h/stdlib.h/math.h -- confirm against upstream SLEEF). */
#ifdef ENABLE_MAIN // gcc -DENABLE_MAIN -Wno-attributes -I../common -I../arch -DENABLE_AVX2 -mavx2 -mfma sleefsimddp.c rempitab.c ../common/common.c -lm #include #include #include int main(int argc, char **argv) { vdouble d1 = vcast_vd_d(atof(argv[1])); vdouble d2 = vcast_vd_d(atof(argv[2])); //vdouble d3 = vcast_vd_d(atof(argv[3])); //vdouble r = xnextafter(d1, d2); //int i; //double fr = frexp(atof(argv[1]), &i); //printf("%.20g\n", xfma(d1, d2, d3)[0]);; //printf("test %.20g\n", xtgamma_u1(d1)[0]); //printf("corr %.20g\n", 
tgamma(d1[0])); //printf("test %.20g\n", xerf_u1(d1)[0]); //printf("corr %.20g\n", erf(d1[0])); //printf("test %.20g\n", xerfc_u15(d1)[0]); //printf("corr %.20g\n", erfc(d1[0])); //printf("%.20g\n", nextafter(d1[0], d2[0]));; //printf("%.20g\n", vcast_d_vd(xhypot_u05(d1, d2))); //printf("%.20g\n", fr); printf("%.20g\n", fmod(atof(argv[1]), atof(argv[2]))); printf("%.20g\n", xfmod(d1, d2)[0]); //vdouble2 r = xsincospi_u35(a); //printf("%g, %g\n", vcast_d_vd(r.x), vcast_d_vd(r.y)); } #endif #ifdef ENABLE_GNUABI /* "finite" aliases for compatibility with GLIBC */ EXPORT CONST VECTOR_CC vdouble __acos_finite (vdouble) __attribute__((weak, alias(str_xacos ))); EXPORT CONST VECTOR_CC vdouble __acosh_finite (vdouble) __attribute__((weak, alias(str_xacosh ))); EXPORT CONST VECTOR_CC vdouble __asin_finite (vdouble) __attribute__((weak, alias(str_xasin_u1 ))); EXPORT CONST VECTOR_CC vdouble __atan2_finite (vdouble, vdouble) __attribute__((weak, alias(str_xatan2_u1 ))); EXPORT CONST VECTOR_CC vdouble __atanh_finite (vdouble) __attribute__((weak, alias(str_xatanh ))); EXPORT CONST VECTOR_CC vdouble __cosh_finite (vdouble) __attribute__((weak, alias(str_xcosh ))); EXPORT CONST VECTOR_CC vdouble __exp10_finite (vdouble) __attribute__((weak, alias(str_xexp10 ))); EXPORT CONST VECTOR_CC vdouble __exp2_finite (vdouble) __attribute__((weak, alias(str_xexp2 ))); EXPORT CONST VECTOR_CC vdouble __exp_finite (vdouble) __attribute__((weak, alias(str_xexp ))); EXPORT CONST VECTOR_CC vdouble __fmod_finite (vdouble, vdouble) __attribute__((weak, alias(str_xfmod ))); EXPORT CONST VECTOR_CC vdouble __remainder_finite(vdouble, vdouble) __attribute__((weak, alias(str_xremainder))); EXPORT CONST VECTOR_CC vdouble __modf_finite (vdouble, vdouble *) __attribute__((weak, alias(str_xmodf ))); EXPORT CONST VECTOR_CC vdouble __hypot_u05_finite(vdouble, vdouble) __attribute__((weak, alias(str_xhypot_u05))); EXPORT CONST VECTOR_CC vdouble __lgamma_u1_finite(vdouble) __attribute__((weak, 
alias(str_xlgamma_u1))); EXPORT CONST VECTOR_CC vdouble __log10_finite (vdouble) __attribute__((weak, alias(str_xlog10 ))); EXPORT CONST VECTOR_CC vdouble __log_finite (vdouble) __attribute__((weak, alias(str_xlog_u1 ))); EXPORT CONST VECTOR_CC vdouble __pow_finite (vdouble, vdouble) __attribute__((weak, alias(str_xpow ))); EXPORT CONST VECTOR_CC vdouble __sinh_finite (vdouble) __attribute__((weak, alias(str_xsinh ))); EXPORT CONST VECTOR_CC vdouble __sqrt_finite (vdouble) __attribute__((weak, alias(str_xsqrt ))); EXPORT CONST VECTOR_CC vdouble __tgamma_u1_finite(vdouble) __attribute__((weak, alias(str_xtgamma_u1))); #ifdef HEADER_MASKED #include HEADER_MASKED #endif #endif /* #ifdef ENABLE_GNUABI */ ================================================ FILE: src/sleefsimddp_emulation.c ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ /* NOTE(review): the parameterless #include directive below lost its <...> header name during extraction -- restore from upstream (likely nsimd/nsimd.h). */ #include #ifdef ENABLE_NEON32 #include "renameneon32.h" #define nsimd_vec_f64 nsimd_neon128_vf64 #endif #ifdef ENABLE_VSX #include "renamevsx.h" #define nsimd_vec_f64 nsimd_vmx_vf64 #endif
/* Each wrapper below bridges a SLEEF x* double-precision entry point to nsimd's scalar CPU kernel: copy the two f64 lanes (v0/v1) into an nsimd_cpu_vf64, call nsimd_<op>_cpu_f64, and copy the lanes back out. Used on targets (NEON32, VSX) where the native SLEEF f64 kernels are emulated. */
nsimd_vec_f64 xsin(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_sin_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xcos(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_cos_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xtan(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_tan_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xasin(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_asin_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xacos(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_acos_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xatan(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_atan_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xatan2(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, a1, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; a1.v0 = a1_.v0; a1.v1 = a1_.v1; ret = nsimd_atan2_u35_cpu_f64(a0, a1); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xlog(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_log_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xcbrt(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = 
nsimd_cbrt_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xsin_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_sin_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xcos_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_cos_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xtan_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_tan_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xasin_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_asin_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xacos_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_acos_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xatan_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_atan_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xatan2_u1(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, a1, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; a1.v0 = a1_.v0; a1.v1 = a1_.v1; ret = nsimd_atan2_u10_cpu_f64(a0, a1); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xlog_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_log_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xcbrt_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_cbrt_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 
xexp(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_exp_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xpow(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, a1, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; a1.v0 = a1_.v0; a1.v1 = a1_.v1; ret = nsimd_pow_u10_cpu_f64(a0, a1); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xsinh(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_sinh_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xcosh(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_cosh_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xtanh(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_tanh_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xsinh_u35(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_sinh_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xcosh_u35(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_cosh_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xtanh_u35(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_tanh_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xasinh(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_asinh_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xacosh(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = 
nsimd_acosh_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xatanh(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_atanh_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xexp2(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_exp2_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xexp2_u35(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_exp2_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xexp10(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_exp10_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xexp10_u35(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_exp10_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xexpm1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_expm1_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xlog10(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_log10_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xlog2(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_log2_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xlog2_u35(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_log2_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xlog1p(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 
a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_log1p_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xsinpi_u05(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_sinpi_u05_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xcospi_u05(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_cospi_u05_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xhypot_u05(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, a1, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; a1.v0 = a1_.v0; a1.v1 = a1_.v1; ret = nsimd_hypot_u05_cpu_f64(a0, a1); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xhypot_u35(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, a1, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; a1.v0 = a1_.v0; a1.v1 = a1_.v1; ret = nsimd_hypot_u35_cpu_f64(a0, a1); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xfmod(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, a1, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; a1.v0 = a1_.v0; a1.v1 = a1_.v1; ret = nsimd_fmod_cpu_f64(a0, a1); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xremainder(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, a1, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; a1.v0 = a1_.v0; a1.v1 = a1_.v1; ret = nsimd_remainder_cpu_f64(a0, a1); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xlgamma_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_lgamma_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xtgamma_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_tgamma_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = 
ret.v1; return ret_; } nsimd_vec_f64 xerf_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_erf_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xerfc_u15(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_erfc_u15_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } ================================================ FILE: src/sleefsimdsp.c ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) // Always use -ffp-contract=off option to compile SLEEF. #if !defined(SLEEF_GENHEADER) #include #include #include #include #endif #include "misc.h"
/* NOTE(review): the four parameterless #include directives above lost their <...> header names during extraction; restore from upstream SLEEF (sleefsimdsp.c's standard headers). */
extern const float Sleef_rempitabsp[]; #define __SLEEFSIMDSP_C__ #if (defined(_MSC_VER)) #pragma fp_contract (off) #endif // Intel #ifdef ENABLE_SSE2 #define CONFIG 2 #if !defined(SLEEF_GENHEADER) #include "helpersse2.h" #else #include "macroonlySSE2.h" #endif #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renamesse2_gnuabi.h" #else #include "renamesse2.h" #endif #endif #endif #ifdef ENABLE_SSE4 #define CONFIG 4 #if !defined(SLEEF_GENHEADER) #include "helpersse2.h" #else #include "macroonlySSE4.h" #endif #ifdef DORENAME #include "renamesse4.h" #endif #endif #ifdef ENABLE_AVX #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperavx.h" #else #include "macroonlyAVX.h" #endif #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renameavx_gnuabi.h" #else #include "renameavx.h" #endif #endif #endif #ifdef ENABLE_FMA4 #define CONFIG 4 #if !defined(SLEEF_GENHEADER) #include "helperavx.h" #else #include "macroonlyFMA4.h" #endif #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renamefma4_gnuabi.h" #else #include "renamefma4.h" #endif #endif #endif #ifdef ENABLE_AVX2 #define 
CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperavx2.h" #else #include "macroonlyAVX2.h" #endif #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renameavx2_gnuabi.h" #else #include "renameavx2.h" #endif #endif #endif
/* Each ENABLE_* branch pins the per-ISA CONFIG value and pulls the matching helper header (normal build) or macroonly*.h header (SLEEF_GENHEADER generation mode), plus the rename*.h symbol-renaming header when DORENAME is set. */
#ifdef ENABLE_AVX2128 #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperavx2_128.h" #else #include "macroonlyAVX2128.h" #endif #ifdef DORENAME #include "renameavx2128.h" #endif #endif #ifdef ENABLE_AVX512F #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperavx512f.h" #else #include "macroonlyAVX512F.h" #endif #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renameavx512f_gnuabi.h" #else #include "renameavx512f.h" #endif #endif #endif #ifdef ENABLE_AVX512FNOFMA #define CONFIG 2 #if !defined(SLEEF_GENHEADER) #include "helperavx512f.h" #else #include "macroonlyAVX512FNOFMA.h" #endif #ifdef DORENAME #include "renameavx512fnofma.h" #endif #endif // Arm #ifdef ENABLE_ADVSIMD #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperadvsimd.h" #else #include "macroonlyADVSIMD.h" #endif #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renameadvsimd_gnuabi.h" #else #include "renameadvsimd.h" #endif #endif #endif #ifdef ENABLE_ADVSIMDNOFMA #define CONFIG 2 #if !defined(SLEEF_GENHEADER) #include "helperadvsimd.h" #else #include "macroonlyADVSIMDNOFMA.h" #endif #ifdef DORENAME #include "renameadvsimdnofma.h" #endif #endif #ifdef ENABLE_NEON32 #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperneon32.h" #endif #ifdef DORENAME #include "renameneon32.h" #endif #endif #ifdef ENABLE_NEON32VFPV4 #define CONFIG 4 #if !defined(SLEEF_GENHEADER) #include "helperneon32.h" #endif #ifdef DORENAME #include "renameneon32vfpv4.h" #endif #endif #ifdef ENABLE_SVE #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helpersve.h" #else #include "macroonlySVE.h" #endif #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renamesve_gnuabi.h" #else #include "renamesve.h" #endif /* ENABLE_GNUABI */ #endif /* DORENAME */ #endif /* ENABLE_SVE */ 
#ifdef ENABLE_SVENOFMA #define CONFIG 2 #if !defined(SLEEF_GENHEADER) #include "helpersve.h" #else #include "macroonlySVENOFMA.h" #endif #ifdef DORENAME #include "renamesvenofma.h" #endif /* DORENAME */ #endif /* ENABLE_SVE */ /* PowerPC, s390x and generic fallbacks follow the same CONFIG/helper/rename pattern. */ // IBM #ifdef ENABLE_VSX #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperpower_128.h" #else #include "macroonlyVSX.h" #endif #ifdef DORENAME #include "renamevsx.h" #endif #endif #ifdef ENABLE_VSXNOFMA #define CONFIG 2 #if !defined(SLEEF_GENHEADER) #include "helperpower_128.h" #else #include "macroonlyVSXNOFMA.h" #endif #ifdef DORENAME #include "renamevsxnofma.h" #endif #endif #ifdef ENABLE_ZVECTOR2 #define CONFIG 140 #if !defined(SLEEF_GENHEADER) #include "helpers390x_128.h" #else #include "macroonlyZVECTOR2.h" #endif #ifdef DORENAME #include "renamezvector2.h" #endif #endif #ifdef ENABLE_ZVECTOR2NOFMA #define CONFIG 141 #if !defined(SLEEF_GENHEADER) #include "helpers390x_128.h" #else #include "macroonlyZVECTOR2NOFMA.h" #endif #ifdef DORENAME #include "renamezvector2nofma.h" #endif #endif // Generic #ifdef ENABLE_VECEXT #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helpervecext.h" #endif #ifdef DORENAME #include "renamevecext.h" #endif #endif #ifdef ENABLE_PUREC #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperpurec.h" #endif #ifdef DORENAME #include "renamepurec.h" #endif #endif #ifdef ENABLE_PUREC_SCALAR #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperpurec_scalar.h" #else #include "macroonlyPUREC_SCALAR.h" #endif #ifdef DORENAME #include "renamepurec_scalar.h" #endif #endif #ifdef ENABLE_PURECFMA_SCALAR #define CONFIG 2 #if !defined(SLEEF_GENHEADER) #include "helperpurec_scalar.h" #else #include "macroonlyPURECFMA_SCALAR.h" #endif #ifdef DORENAME #include "renamepurecfma_scalar.h" #endif #endif // #define MLA(x, y, z) vmla_vf_vf_vf_vf((x), (y), (z)) #define C2V(c) vcast_vf_f(c) #include "estrin.h" // #include "df.h" static INLINE CONST VECTOR_CC vopmask visnegzero_vo_vf(vfloat d) { 
/* NOTE(review): body of visnegzero_vo_vf -- its signature sits at the end of the previous chunk line. True where the lane's bit pattern equals that of -0.0f. */
return veq_vo_vi2_vi2(vreinterpret_vi2_vf(d), vreinterpret_vi2_vf(vcast_vf_f(-0.0))); } static INLINE VECTOR_CC vopmask vnot_vo32_vo32(vopmask x) { return vxor_vo_vo_vo(x, veq_vo_vi2_vi2(vcast_vi2_i(0), vcast_vi2_i(0))); }
/* Sign-manipulation helpers: all are built from mask ops against the -0.0f bit pattern (isolated sign bit). */
static INLINE CONST VECTOR_CC vmask vsignbit_vm_vf(vfloat f) { return vand_vm_vm_vm(vreinterpret_vm_vf(f), vreinterpret_vm_vf(vcast_vf_f(-0.0f))); } static INLINE CONST VECTOR_CC vfloat vmulsign_vf_vf_vf(vfloat x, vfloat y) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(x), vsignbit_vm_vf(y))); } static INLINE CONST VECTOR_CC vfloat vcopysign_vf_vf_vf(vfloat x, vfloat y) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(x)), vand_vm_vm_vm (vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(y)))); } static INLINE CONST VECTOR_CC vfloat vsign_vf_vf(vfloat f) { return vreinterpret_vf_vm(vor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(1.0f)), vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f)))); } static INLINE CONST VECTOR_CC vopmask vsignbit_vo_vf(vfloat d) { return veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vcast_vi2_i(0x80000000)), vcast_vi2_i(0x80000000)); } static INLINE CONST VECTOR_CC vint2 vsel_vi2_vf_vf_vi2_vi2(vfloat f0, vfloat f1, vint2 x, vint2 y) { return vsel_vi2_vo_vi2_vi2(vlt_vo_vf_vf(f0, f1), x, y); } static INLINE CONST VECTOR_CC vint2 vsel_vi2_vf_vi2(vfloat d, vint2 x) { return vand_vi2_vo_vi2(vsignbit_vo_vf(d), x); } static INLINE CONST VECTOR_CC vopmask visint_vo_vf(vfloat y) { return veq_vo_vf_vf(vtruncate_vf_vf(y), y); } static INLINE CONST VECTOR_CC vopmask visnumber_vo_vf(vfloat x) { return vnot_vo32_vo32(vor_vo_vo_vo(visinf_vo_vf(x), visnan_vo_vf(x))); } #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
/* vilogbk/vilogb2k: exponent extraction from the f32 bit layout (shift 23, mask 0xff, unbias 0x7f); vilogbk first rescales subnormals by 2^64 and compensates with -(64 + 0x7f). */
static INLINE CONST VECTOR_CC vint2 vilogbk_vi2_vf(vfloat d) { vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(5.421010862427522E-20f)); d = vsel_vf_vo_vf_vf(o, 
vmul_vf_vf_vf(vcast_vf_f(1.8446744073709552E19f), d), d); vint2 q = vand_vi2_vi2_vi2(vsrl_vi2_vi2_i(vreinterpret_vi2_vf(d), 23), vcast_vi2_i(0xff)); q = vsub_vi2_vi2_vi2(q, vsel_vi2_vo_vi2_vi2(o, vcast_vi2_i(64 + 0x7f), vcast_vi2_i(0x7f))); return q; } static INLINE CONST VECTOR_CC vint2 vilogb2k_vi2_vf(vfloat d) { vint2 q = vreinterpret_vi2_vf(d); q = vsrl_vi2_vi2_i(q, 23); q = vand_vi2_vi2_vi2(q, vcast_vi2_i(0xff)); q = vsub_vi2_vi2_vi2(q, vcast_vi2_i(0x7f)); return q; } #endif // EXPORT CONST VECTOR_CC vint2 xilogbf(vfloat d) { vint2 e = vilogbk_vi2_vf(vabs_vf_vf(d)); e = vsel_vi2_vo_vi2_vi2(veq_vo_vf_vf(d, vcast_vf_f(0.0f)), vcast_vi2_i(SLEEF_FP_ILOGB0), e); e = vsel_vi2_vo_vi2_vi2(visnan_vo_vf(d), vcast_vi2_i(SLEEF_FP_ILOGBNAN), e); e = vsel_vi2_vo_vi2_vi2(visinf_vo_vf(d), vcast_vi2_i(INT_MAX), e); return e; } static INLINE CONST VECTOR_CC vfloat vpow2i_vf_vi2(vint2 q) { return vreinterpret_vf_vi2(vsll_vi2_vi2_i(vadd_vi2_vi2_vi2(q, vcast_vi2_i(0x7f)), 23)); }
/* vldexp*: scale x by 2^q. vldexp_vf_vf_vi2 splits q into staged multiplies to stay in range; vldexp2/vldexp3 are cheaper variants (two pow2 factors / direct exponent-field add). */
static INLINE CONST VECTOR_CC vfloat vldexp_vf_vf_vi2(vfloat x, vint2 q) { vfloat u; vint2 m = vsra_vi2_vi2_i(q, 31); m = vsll_vi2_vi2_i(vsub_vi2_vi2_vi2(vsra_vi2_vi2_i(vadd_vi2_vi2_vi2(m, q), 6), m), 4); q = vsub_vi2_vi2_vi2(q, vsll_vi2_vi2_i(m, 2)); m = vadd_vi2_vi2_vi2(m, vcast_vi2_i(0x7f)); m = vand_vi2_vi2_vi2(vgt_vi2_vi2_vi2(m, vcast_vi2_i(0)), m); vint2 n = vgt_vi2_vi2_vi2(m, vcast_vi2_i(0xff)); m = vor_vi2_vi2_vi2(vandnot_vi2_vi2_vi2(n, m), vand_vi2_vi2_vi2(n, vcast_vi2_i(0xff))); u = vreinterpret_vf_vi2(vsll_vi2_vi2_i(m, 23)); x = vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(x, u), u), u), u); u = vreinterpret_vf_vi2(vsll_vi2_vi2_i(vadd_vi2_vi2_vi2(q, vcast_vi2_i(0x7f)), 23)); return vmul_vf_vf_vf(x, u); } static INLINE CONST VECTOR_CC vfloat vldexp2_vf_vf_vi2(vfloat d, vint2 e) { return vmul_vf_vf_vf(vmul_vf_vf_vf(d, vpow2i_vf_vi2(vsra_vi2_vi2_i(e, 1))), vpow2i_vf_vi2(vsub_vi2_vi2_vi2(e, vsra_vi2_vi2_i(e, 1)))); } static INLINE CONST VECTOR_CC vfloat vldexp3_vf_vf_vi2(vfloat d, vint2 
q) { return vreinterpret_vf_vi2(vadd_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vsll_vi2_vi2_i(q, 23))); } EXPORT CONST VECTOR_CC vfloat xldexpf(vfloat x, vint2 q) { return vldexp_vf_vf_vi2(x, q); } #if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA)) typedef struct { vfloat d; vint2 i; } fi_t; static vfloat figetd_vf_di(fi_t d) { return d.d; } static vint2 figeti_vi2_di(fi_t d) { return d.i; } static fi_t fisetdi_fi_vf_vi2(vfloat d, vint2 i) { fi_t r = { d, i }; return r; } typedef struct { vfloat2 df; vint2 i; } dfi_t; static vfloat2 dfigetdf_vf2_dfi(dfi_t d) { return d.df; } static vint2 dfigeti_vi2_dfi(dfi_t d) { return d.i; } static dfi_t dfisetdfi_dfi_vf2_vi2(vfloat2 v, vint2 i) { dfi_t r = { v, i }; return r; } static dfi_t dfisetdf_dfi_dfi_vf2(dfi_t dfi, vfloat2 v) { dfi.df = v; return dfi; } #endif static INLINE CONST VECTOR_CC vfloat vorsign_vf_vf_vf(vfloat x, vfloat y) { return vreinterpret_vf_vm(vor_vm_vm_vm(vreinterpret_vm_vf(x), vsignbit_vm_vf(y))); }
/* rempisubf: returns x minus the nearest multiple of 1/4 together with the low quadrant bits. With FULL_FP_ROUNDING it uses vrint directly; otherwise rounding is done via the add/subtract-2^23 trick (vcast_vf_f(1 << 23)) with sign restored through vorsign. */
static INLINE CONST fi_t rempisubf(vfloat x) { #ifdef FULL_FP_ROUNDING vfloat y = vrint_vf_vf(vmul_vf_vf_vf(x, vcast_vf_f(4))); vint2 vi = vtruncate_vi2_vf(vsub_vf_vf_vf(y, vmul_vf_vf_vf(vrint_vf_vf(x), vcast_vf_f(4)))); return fisetdi_fi_vf_vi2(vsub_vf_vf_vf(x, vmul_vf_vf_vf(y, vcast_vf_f(0.25))), vi); #else vfloat c = vmulsign_vf_vf_vf(vcast_vf_f(1 << 23), x); vfloat rint4x = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(vmul_vf_vf_vf(vcast_vf_f(4), x)), vcast_vf_f(1 << 23)), vmul_vf_vf_vf(vcast_vf_f(4), x), vorsign_vf_vf_vf(vsub_vf_vf_vf(vmla_vf_vf_vf_vf(vcast_vf_f(4), x, c), c), x)); vfloat rintx = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(1 << 23)), x, vorsign_vf_vf_vf(vsub_vf_vf_vf(vadd_vf_vf_vf(x, c), c), x)); return fisetdi_fi_vf_vi2(vmla_vf_vf_vf_vf(vcast_vf_f(-0.25), rint4x, x), vtruncate_vi2_vf(vmla_vf_vf_vf_vf(vcast_vf_f(-4), rintx, rint4x))); #endif }
/* rempif: large-argument trig reduction using the Sleef_rempitabsp table (gathered per-lane by exponent index ex); accumulates double-float partial products, then scales by 2*pi (3.1415927410125732422f*2 plus correction term). Small |a| < 0.7 bypasses the reduction. Returns the reduced double-float value and quadrant in a dfi_t. */
static INLINE CONST dfi_t rempif(vfloat a) { vfloat2 x, y, z; vint2 ex = vilogb2k_vi2_vf(a); #if defined(ENABLE_AVX512F) || 
defined(ENABLE_AVX512FNOFMA) ex = vandnot_vi2_vi2_vi2(vsra_vi2_vi2_i(ex, 31), ex); ex = vand_vi2_vi2_vi2(ex, vcast_vi2_i(127)); #endif ex = vsub_vi2_vi2_vi2(ex, vcast_vi2_i(25)); vint2 q = vand_vi2_vo_vi2(vgt_vo_vi2_vi2(ex, vcast_vi2_i(90-25)), vcast_vi2_i(-64)); a = vldexp3_vf_vf_vi2(a, q); ex = vandnot_vi2_vi2_vi2(vsra_vi2_vi2_i(ex, 31), ex); ex = vsll_vi2_vi2_i(ex, 2); x = dfmul_vf2_vf_vf(a, vgather_vf_p_vi2(Sleef_rempitabsp, ex)); fi_t di = rempisubf(vf2getx_vf_vf2(x)); q = figeti_vi2_di(di); x = vf2setx_vf2_vf2_vf(x, figetd_vf_di(di)); x = dfnormalize_vf2_vf2(x); y = dfmul_vf2_vf_vf(a, vgather_vf_p_vi2(Sleef_rempitabsp+1, ex)); x = dfadd2_vf2_vf2_vf2(x, y); di = rempisubf(vf2getx_vf_vf2(x)); q = vadd_vi2_vi2_vi2(q, figeti_vi2_di(di)); x = vf2setx_vf2_vf2_vf(x, figetd_vf_di(di)); x = dfnormalize_vf2_vf2(x); y = vcast_vf2_vf_vf(vgather_vf_p_vi2(Sleef_rempitabsp+2, ex), vgather_vf_p_vi2(Sleef_rempitabsp+3, ex)); y = dfmul_vf2_vf2_vf(y, a); x = dfadd2_vf2_vf2_vf2(x, y); x = dfnormalize_vf2_vf2(x); x = dfmul_vf2_vf2_vf2(x, vcast_vf2_f_f(3.1415927410125732422f*2, -8.7422776573475857731e-08f*2)); x = vsel_vf2_vo_vf2_vf2(vlt_vo_vf_vf(vabs_vf_vf(a), vcast_vf_f(0.7f)), vcast_vf2_vf_vf(a, vcast_vf_f(0)), x); return dfisetdfi_dfi_vf2_vi2(x, q); } EXPORT CONST VECTOR_CC vfloat xsinf(vfloat d) { #if !defined(DETERMINISTIC) vint2 q; vfloat u, s, r = d; if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) { q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI))); u = vcast_vf_vi2(q); d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f), d); d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f), d); d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f), d); } else if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf))))) { q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI))); u = vcast_vf_vi2(q); d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af), d); d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf), d); d = vmla_vf_vf_vf_vf(u, 
vcast_vf_f(-PI_Cf), d);
    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df), d);
  } else {
    /* Huge arguments: full rempif reduction; the quadrant is rebuilt from
       the reduction's integer bits and the sign of the reduced value. */
    dfi_t dfi = rempif(d);
    q = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(2), vcast_vi2_i(1)));
    q = vsra_vi2_vi2_i(q, 2);
    vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(1));
    vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))), vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))));
    x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
    dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
    d = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
    /* inf/nan inputs propagate as nan (OR all mask bits into the payload). */
    d = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(r), visnan_vo_vf(r)), vreinterpret_vm_vf(d)));
  }
  s = vmul_vf_vf_vf(d, d);
  /* Flip the sign of d on odd quadrants (XOR with -0.0f under mask). */
  d = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(d)));
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f));
  u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d);
  /* Preserve -0.0 exactly. */
  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(r), r, u);
  return u;
#else // #if !defined(DETERMINISTIC)
  /* Deterministic variant: always computes every tier and blends per lane,
     so results do not depend on which lanes take which path. */
  vint2 q;
  vfloat u, s, r = d;
  q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)));
  u = vcast_vf_vi2(q);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f), d);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f), d);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f), d);
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAX2f));
  if (!LIKELY(vtestallones_i_vo32(g))) {
    s = vcast_vf_vi2(q);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Af), r);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Bf), u);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Cf), u);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Df), u);
    d = vsel_vf_vo_vf_vf(g, d, u);
    g = vlt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAXf));
    if (!LIKELY(vtestallones_i_vo32(g))) {
      dfi_t dfi = rempif(r);
      vint2 q2 = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
      q2 = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q2, q2), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(2), vcast_vi2_i(1)));
      q2 = vsra_vi2_vi2_i(q2, 2);
      vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(1));
      vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))), vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))));
      x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
      dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
      u = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
      u = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(r), visnan_vo_vf(r)), vreinterpret_vm_vf(u)));
      q = vsel_vi2_vo_vi2_vi2(g, q, q2);
      d = vsel_vf_vo_vf_vf(g, d, u);
    }
  }
  s = vmul_vf_vf_vf(d, d);
  d = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(d)));
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f));
  u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d);
  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(r), r, u);
  return u;
#endif // #if !defined(DETERMINISTIC)
}

/* Vectorized cosf.  Same three-tier structure as xsinf, but the quadrant is
   computed from rint(d/pi - 0.5) with a half-period offset (q = 2k+1), and
   the sign flip keys off bit 1 of q instead of bit 0. */
EXPORT CONST VECTOR_CC vfloat xcosf(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vfloat u, s, r = d;
  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) {
    q = vrint_vi2_vf(vsub_vf_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)), vcast_vf_f(0.5f)));
    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1));
    u = vcast_vf_vi2(q);
    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), d);
    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), d);
  } else if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf))))) {
    q = vrint_vi2_vf(vsub_vf_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)), vcast_vf_f(0.5f)));
    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1));
    u = vcast_vf_vi2(q);
    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), d);
    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), d);
    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), d);
    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), d);
  } else {
    dfi_t dfi = rempif(d);
    q = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(8), vcast_vi2_i(7)));
    q = vsra_vi2_vi2_i(q, 1);
    vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(0));
    vfloat y = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vf_f(0), vcast_vf_f(-1));
    vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), y), vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), y));
    x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
    dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
    d = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
    d =
vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(r), visnan_vo_vf(r)), vreinterpret_vm_vf(d)));
  }
  s = vmul_vf_vf_vf(d, d);
  /* Flip the sign when bit 1 of q is clear (cosine quadrant parity). */
  d = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(d)));
  /* Same odd polynomial in s = d*d as xsinf. */
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f));
  u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d);
  return u;
#else // #if !defined(DETERMINISTIC)
  /* Deterministic (blend-all-tiers) variant of the above. */
  vint2 q;
  vfloat u, s, r = d;
  q = vrint_vi2_vf(vsub_vf_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)), vcast_vf_f(0.5f)));
  q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1));
  u = vcast_vf_vi2(q);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), d);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), d);
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAX2f));
  if (!LIKELY(vtestallones_i_vo32(g))) {
    s = vcast_vf_vi2(q);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Af*0.5f), r);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Bf*0.5f), u);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Cf*0.5f), u);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Df*0.5f), u);
    d = vsel_vf_vo_vf_vf(g, d, u);
    g = vlt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAXf));
    if (!LIKELY(vtestallones_i_vo32(g))) {
      dfi_t dfi = rempif(r);
      vint2 q2 = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
      q2 = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q2, q2), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(8), vcast_vi2_i(7)));
      q2 = vsra_vi2_vi2_i(q2, 1);
      vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(0));
      vfloat y = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vf_f(0), vcast_vf_f(-1));
      vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), y), vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), y));
      x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
      dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
      u = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
      u = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(r), visnan_vo_vf(r)), vreinterpret_vm_vf(u)));
      q = vsel_vi2_vo_vi2_vi2(g, q, q2);
      d = vsel_vf_vo_vf_vf(g, d, u);
    }
  }
  s = vmul_vf_vf_vf(d, d);
  d = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(d)));
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f));
  u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d);
  return u;
#endif // #if !defined(DETERMINISTIC)
}

/* Vectorized tanf.  Reduces modulo pi/2 (q = rint(2*d/pi)); after the
   polynomial, odd quadrants take the reciprocal (tan = 1/tan shifted). */
EXPORT CONST VECTOR_CC vfloat xtanf(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vopmask o;
  vfloat u, s, x;
  x = d;
  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f*0.5f))))) {
    q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)(2 * M_1_PI))));
    u = vcast_vf_vi2(q);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), x);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), x);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), x);
  } else if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf))))) {
    q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)(2 * M_1_PI))));
    u = vcast_vf_vi2(q);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), x);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), x);
    x = vmla_vf_vf_vf_vf(u,
vcast_vf_f(-PI_Cf*0.5f), x);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), x);
  } else {
    /* Huge arguments: rempif already returns the quadrant directly. */
    dfi_t dfi = rempif(d);
    q = dfigeti_vi2_dfi(dfi);
    x = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
    x = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(x)));
    x = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, x);
  }
  s = vmul_vf_vf_vf(x, x);
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
  x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(x)));
#if defined(ENABLE_NEON32)
  /* Plain Horner form for NEON32; POLY6 is the Estrin-style form below. */
  u = vcast_vf_f(0.00927245803177356719970703f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00331984995864331722259521f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0242998078465461730957031f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0534495301544666290283203f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.133383005857467651367188f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.333331853151321411132812f));
#else
  vfloat s2 = vmul_vf_vf_vf(s, s), s4 = vmul_vf_vf_vf(s2, s2);
  u = POLY6(s, s2, s4, 0.00927245803177356719970703f, 0.00331984995864331722259521f, 0.0242998078465461730957031f, 0.0534495301544666290283203f, 0.133383005857467651367188f, 0.333331853151321411132812f);
#endif
  u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, x), x);
  /* Odd quadrant: tan is the negative reciprocal (sign handled above). */
  u = vsel_vf_vo_vf_vf(o, vrec_vf_vf(u), u);
  return u;
#else // #if !defined(DETERMINISTIC)
  /* Deterministic (blend-all-tiers) variant. */
  vint2 q;
  vopmask o;
  vfloat u, s, x;
  q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)(2 * M_1_PI))));
  u = vcast_vf_vi2(q);
  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), x);
  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), x);
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f*0.5f));
  if (!LIKELY(vtestallones_i_vo32(g))) {
    vint2 q2 = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)(2 * M_1_PI))));
    s = vcast_vf_vi2(q);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Af*0.5f), d);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Bf*0.5f), u);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Cf*0.5f), u);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Df*0.5f), u);
    q = vsel_vi2_vo_vi2_vi2(g, q, q2);
    x = vsel_vf_vo_vf_vf(g, x, u);
    g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf));
    if (!LIKELY(vtestallones_i_vo32(g))) {
      dfi_t dfi = rempif(d);
      u = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
      u = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(u)));
      u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, u);
      q = vsel_vi2_vo_vi2_vi2(g, q, dfigeti_vi2_dfi(dfi));
      x = vsel_vf_vo_vf_vf(g, x, u);
    }
  }
  s = vmul_vf_vf_vf(x, x);
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
  x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(x)));
#if defined(ENABLE_NEON32)
  u = vcast_vf_f(0.00927245803177356719970703f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00331984995864331722259521f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0242998078465461730957031f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0534495301544666290283203f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.133383005857467651367188f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.333331853151321411132812f));
#else
  vfloat s2 = vmul_vf_vf_vf(s, s), s4 = vmul_vf_vf_vf(s2, s2);
  u = POLY6(s, s2, s4, 0.00927245803177356719970703f, 0.00331984995864331722259521f, 0.0242998078465461730957031f, 0.0534495301544666290283203f, 0.133383005857467651367188f, 0.333331853151321411132812f);
#endif
  u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, x), x);
  u = vsel_vf_vo_vf_vf(o, vrec_vf_vf(u), u);
  return u;
#endif // #if !defined(DETERMINISTIC)
}

/* Higher-accuracy sinf: same reduction strategy as xsinf, but the reduced
   argument and the polynomial tail are carried in double-float (vfloat2)
   arithmetic (dfadd2/dfmul/dfsqu) before the final rounding to vfloat. */
EXPORT CONST VECTOR_CC vfloat xsinf_u1(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vfloat u, v;
  vfloat2 s, t, x;
  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) {
    u =
vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(M_1_PI)));
    q = vrint_vi2_vf(u);
    v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f), d);
    /* Accumulate the reduction residue in double-float. */
    s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f)));
    s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f)));
  } else {
    dfi_t dfi = rempif(d);
    q = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(2), vcast_vi2_i(1)));
    q = vsra_vi2_vi2_i(q, 2);
    vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(1));
    vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))), vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))));
    x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
    dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
    s = dfnormalize_vf2_vf2(dfigetdf_vf2_dfi(dfi));
#if !defined(_MSC_VER)
    s = vf2setx_vf2_vf2_vf(s, vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(vf2getx_vf_vf2(s)))));
#else
    /* MSVC path writes the member directly (presumably a workaround for the
       setter helper on that compiler -- kept as in upstream SLEEF). */
    s.x = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(s.x)));
#endif
  }
  t = s;
  s = dfsqu_vf2_vf2(s);
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00833307858556509017944336f));
  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(-0.166666597127914428710938f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(s))), s));
  u = dfmul_vf_vf2_vf2(t, x);
  /* Negate on odd quadrants. */
  u = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(u)));
  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, u);
  return u;
#else // #if !defined(DETERMINISTIC)
  /* Deterministic variant: blend the rempif path into the fast path. */
  vint2 q;
  vfloat u, v;
  vfloat2 s, t, x;
  u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(M_1_PI)));
  q = vrint_vi2_vf(u);
  v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f), d);
  s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f)));
  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f)));
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f));
  if (!LIKELY(vtestallones_i_vo32(g))) {
    dfi_t dfi = rempif(d);
    vint2 q2 = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
    q2 = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q2, q2), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(2), vcast_vi2_i(1)));
    q2 = vsra_vi2_vi2_i(q2, 2);
    vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(1));
    vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))), vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))));
    x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
    dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
    t = dfnormalize_vf2_vf2(dfigetdf_vf2_dfi(dfi));
    t = vf2setx_vf2_vf2_vf(t, vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(vf2getx_vf_vf2(t)))));
    q = vsel_vi2_vo_vi2_vi2(g, q, q2);
    s = vsel_vf2_vo_vf2_vf2(g, s, t);
  }
  t = s;
  s = dfsqu_vf2_vf2(s);
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00833307858556509017944336f));
  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(-0.166666597127914428710938f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(s))), s));
  u = dfmul_vf_vf2_vf2(t, x);
  u =
vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(u)));
  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, u);
  return u;
#endif // #if !defined(DETERMINISTIC)
}

/* Higher-accuracy cosf: double-float reduction like xsinf_u1, with the
   half-period quadrant offset of xcosf (dq = 2*rint(d/pi - 0.5) + 1). */
EXPORT CONST VECTOR_CC vfloat xcosf_u1(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vfloat u;
  vfloat2 s, t, x;
  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) {
    vfloat dq = vmla_vf_vf_vf_vf(vrint_vf_vf(vmla_vf_vf_vf_vf(d, vcast_vf_f(M_1_PI), vcast_vf_f(-0.5f))), vcast_vf_f(2), vcast_vf_f(1));
    q = vrint_vi2_vf(dq);
    s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_A2f*0.5f)));
    s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_B2f*0.5f)));
    s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_C2f*0.5f)));
  } else {
    dfi_t dfi = rempif(d);
    q = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(8), vcast_vi2_i(7)));
    q = vsra_vi2_vi2_i(q, 1);
    vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(0));
    vfloat y = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vf_f(0), vcast_vf_f(-1));
    vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), y), vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), y));
    x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
    dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
    s = dfnormalize_vf2_vf2(dfigetdf_vf2_dfi(dfi));
#if !defined(_MSC_VER)
    s = vf2setx_vf2_vf2_vf(s, vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(vf2getx_vf_vf2(s)))));
#else
    /* MSVC path writes the member directly (kept as in upstream SLEEF). */
    s.x = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(s.x)));
#endif
  }
  t = s;
  s = dfsqu_vf2_vf2(s);
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00833307858556509017944336f));
  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(-0.166666597127914428710938f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(s))), s));
  u = dfmul_vf_vf2_vf2(t, x);
  /* Cosine sign keys off bit 1 of q. */
  u = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(u)));
  return u;
#else // #if !defined(DETERMINISTIC)
  /* Deterministic variant. */
  vint2 q;
  vfloat u;
  vfloat2 s, t, x;
  vfloat dq = vmla_vf_vf_vf_vf(vrint_vf_vf(vmla_vf_vf_vf_vf(d, vcast_vf_f(M_1_PI), vcast_vf_f(-0.5f))), vcast_vf_f(2), vcast_vf_f(1));
  q = vrint_vi2_vf(dq);
  s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_A2f*0.5f)));
  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_B2f*0.5f)));
  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_C2f*0.5f)));
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f));
  if (!LIKELY(vtestallones_i_vo32(g))) {
    dfi_t dfi = rempif(d);
    vint2 q2 = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
    q2 = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q2, q2), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(8), vcast_vi2_i(7)));
    q2 = vsra_vi2_vi2_i(q2, 1);
    vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(0));
    vfloat y = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vf_f(0), vcast_vf_f(-1));
    vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), y), vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), y));
    x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
    dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x,
dfigetdf_vf2_dfi(dfi)));
    t = dfnormalize_vf2_vf2(dfigetdf_vf2_dfi(dfi));
    t = vf2setx_vf2_vf2_vf(t, vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(vf2getx_vf_vf2(t)))));
    q = vsel_vi2_vo_vi2_vi2(g, q, q2);
    s = vsel_vf2_vo_vf2_vf2(g, s, t);
  }
  t = s;
  s = dfsqu_vf2_vf2(s);
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00833307858556509017944336f));
  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(-0.166666597127914428710938f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(s))), s));
  u = dfmul_vf_vf2_vf2(t, x);
  u = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(u)));
  return u;
#endif // #if !defined(DETERMINISTIC)
}

/* Low-accuracy fast sinf (u3500 = relaxed error-bound class): single-constant
   reduction by pi and a 3-term polynomial; lanes with |d| >= 30 are recomputed
   with the full-accuracy xsinf and blended in. */
EXPORT CONST VECTOR_CC vfloat xfastsinf_u3500(vfloat d) {
  vint2 q;
  vfloat u, s, t = d;
  s = vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI));
  u = vrint_vf_vf(s);
  q = vrint_vi2_vf(s);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-(float)M_PI), d);
  s = vmul_vf_vf_vf(d, d);
  u = vcast_vf_f(-0.1881748176e-3);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.8323502727e-2));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.1666651368e+0));
  u = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, d), u, d);
  u = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(u)));
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(t), vcast_vf_f(30.0f));
  if (!LIKELY(vtestallones_i_vo32(g))) return vsel_vf_vo_vf_vf(g, u, xsinf(t));
  return u;
}

/* Low-accuracy fast cosf; same scheme as xfastsinf_u3500 with a pi/2 phase
   shift, falling back to xcosf for |d| >= 30. */
EXPORT CONST VECTOR_CC vfloat xfastcosf_u3500(vfloat d) {
  vint2 q;
  vfloat u, s, t = d;
  s = vmla_vf_vf_vf_vf(d, vcast_vf_f((float)M_1_PI), vcast_vf_f(-0.5f));
  u = vrint_vf_vf(s);
  q = vrint_vi2_vf(s);
  d = vmla_vf_vf_vf_vf(u,
vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), s);
    s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), s);
  } else {
    dfi_t dfi = rempif(d);
    q = dfigeti_vi2_dfi(dfi);
    s = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
    s = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(s)));
  }
  t = s;
  s = vmul_vf_vf_vf(s, s);
  /* Sine polynomial -> rx. */
  u = vcast_vf_f(-0.000195169282960705459117889f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833215750753879547119141f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666537523269653320312f));
  rx = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(u, s), t, t);
  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);
  /* Cosine polynomial -> ry. */
  u = vcast_vf_f(-2.71811842367242206819355e-07f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416666641831398010253906f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.5));
  ry = vmla_vf_vf_vf_vf(s, u, vcast_vf_f(1));
  /* Swap/negate the pair according to quadrant bits of q. */
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
  r = vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(o, rx, ry), vsel_vf_vo_vf_vf(o, ry, rx));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  return r;
#else // #if !defined(DETERMINISTIC)
  /* Deterministic (blend-all-tiers) variant. */
  vint2 q;
  vopmask o;
  vfloat u, s, t, rx, ry;
  vfloat2 r;
  q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_2_PI)));
  u = vcast_vf_vi2(q);
  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), s);
  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), s);
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f));
  if (!LIKELY(vtestallones_i_vo32(g))) {
    vint2 q2 = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_2_PI)));
    u = vcast_vf_vi2(q2);
    t = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), d);
    t = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), t);
    t = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), t);
    t = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), t);
    q = vsel_vi2_vo_vi2_vi2(g, q, q2);
    s = vsel_vf_vo_vf_vf(g, s, t);
    g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf));
    if (!LIKELY(vtestallones_i_vo32(g))) {
      dfi_t dfi = rempif(d);
      t = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
      t = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(t)));
      q = vsel_vi2_vo_vi2_vi2(g, q, dfigeti_vi2_dfi(dfi));
      s = vsel_vf_vo_vf_vf(g, s, t);
    }
  }
  t = s;
  s = vmul_vf_vf_vf(s, s);
  u = vcast_vf_f(-0.000195169282960705459117889f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833215750753879547119141f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666537523269653320312f));
  rx = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(u, s), t, t);
  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);
  u = vcast_vf_f(-2.71811842367242206819355e-07f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416666641831398010253906f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.5));
  ry = vmla_vf_vf_vf_vf(s, u, vcast_vf_f(1));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
  r = vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(o, rx, ry), vsel_vf_vo_vf_vf(o, ry, rx));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  return r;
#endif // #if !defined(DETERMINISTIC)
}

/* Higher-accuracy sincosf: double-float reduction, sine/cosine polynomials
   evaluated once and routed into (x, y) by the quadrant bits. */
TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSF_U1(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vopmask o;
  vfloat u, v, rx, ry;
  vfloat2 r, s, t, x;
  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) {
    u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI)));
    q = vrint_vi2_vf(u);
    v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
    s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f)));
    s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f)));
  } else {
    dfi_t dfi = rempif(d);
    q = dfigeti_vi2_dfi(dfi);
    s = dfigetdf_vf2_dfi(dfi);
    o = vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d));
    s = vf2setx_vf2_vf2_vf(s, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(s)))));
  }
  t = s;
  s = vf2setx_vf2_vf2_vf(s, dfsqu_vf_vf2(s));
  u = vcast_vf_f(-0.000195169282960705459117889f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00833215750753879547119141f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.166666537523269653320312f));
  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(vf2getx_vf_vf2(s), vf2getx_vf_vf2(t)));
  x = dfadd_vf2_vf2_vf(t, u);
  rx = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));
  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);
  u = vcast_vf_f(-2.71811842367242206819355e-07f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(2.47990446951007470488548e-05f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.00138888787478208541870117f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0416666641831398010253906f));
  u = vmla_vf_vf_vf_vf(u,
vf2getx_vf_vf2(s), vcast_vf_f(-0.5));
  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf_vf(vf2getx_vf_vf2(s), u));
  ry = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));
  /* Route sin/cos into (x, y) and fix the signs from the quadrant bits. */
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
  r = vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(o, rx, ry), vsel_vf_vo_vf_vf(o, ry, rx));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  return r;
#else // #if !defined(DETERMINISTIC)
  /* Deterministic variant of XSINCOSF_U1. */
  vint2 q;
  vopmask o;
  vfloat u, v, rx, ry;
  vfloat2 r, s, t, x;
  u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI)));
  q = vrint_vi2_vf(u);
  v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
  s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f)));
  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f)));
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f));
  if (!LIKELY(vtestallones_i_vo32(g))) {
    dfi_t dfi = rempif(d);
    t = dfigetdf_vf2_dfi(dfi);
    o = vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d));
    t = vf2setx_vf2_vf2_vf(t, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(t)))));
    q = vsel_vi2_vo_vi2_vi2(g, q, dfigeti_vi2_dfi(dfi));
    s = vsel_vf2_vo_vf2_vf2(g, s, t);
  }
  t = s;
  s = vf2setx_vf2_vf2_vf(s, dfsqu_vf_vf2(s));
  u = vcast_vf_f(-0.000195169282960705459117889f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00833215750753879547119141f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.166666537523269653320312f));
  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(vf2getx_vf_vf2(s), vf2getx_vf_vf2(t)));
  x = dfadd_vf2_vf2_vf(t, u);
  rx = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));
  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);
  u = vcast_vf_f(-2.71811842367242206819355e-07f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(2.47990446951007470488548e-05f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.00138888787478208541870117f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0416666641831398010253906f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.5));
  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf_vf(vf2getx_vf_vf2(s), u));
  ry = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
  r = vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(o, rx, ry), vsel_vf_vo_vf_vf(o, ry, rx));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  return r;
#endif // #if !defined(DETERMINISTIC)
}

#if !defined(DETERMINISTIC)
/* sincos(pi*d), 0.5-ULP class: the quadrant comes from truncating 4*d with
   round-to-even adjustment, and both polynomials carry their leading terms in
   double-float constants.  Results are zeroed for |d| > 1e7 and nan'ed for
   infinite inputs at the end. */
TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSPIF_U05(vfloat d) {
  vopmask o;
  vfloat u, s, t, rx, ry;
  vfloat2 r, x, s2;
  u = vmul_vf_vf_vf(d, vcast_vf_f(4));
  vint2 q = vtruncate_vi2_vf(u);
  q = vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vxor_vi2_vi2_vi2(vsrl_vi2_vi2_i(q, 31), vcast_vi2_i(1))), vcast_vi2_i(~1));
  s = vsub_vf_vf_vf(u, vcast_vf_vi2(q));
  t = s;
  s = vmul_vf_vf_vf(s, s);
  s2 = dfmul_vf2_vf_vf(t, t);
  /* Sine part. */
  //
  u = vcast_vf_f(+0.3093842054e-6);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.3657307388e-4));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2490393585e-2));
  x = dfadd2_vf2_vf_vf2(vmul_vf_vf_vf(u, s), vcast_vf2_f_f(-0.080745510756969451904, -1.3373665339076936258e-09));
  x = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(s2, x), vcast_vf2_f_f(0.78539818525314331055, -2.1857338617566484855e-08));
  x = dfmul_vf2_vf2_vf(x, t);
  rx = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));
  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);
  /* Cosine part. */
  //
  u = vcast_vf_f(-0.2430611801e-7);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.3590577080e-5));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.3259917721e-3));
  x = dfadd2_vf2_vf_vf2(vmul_vf_vf_vf(u, s), vcast_vf2_f_f(0.015854343771934509277, 4.4940051354032242811e-10));
  x = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(s2, x), vcast_vf2_f_f(-0.30842512845993041992, -9.0728339030733922277e-09));
  x = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf2(x, s2), vcast_vf_f(1));
  ry = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));
  /* Quadrant routing and sign fixes. */
  //
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0));
  r = vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(o, rx, ry), vsel_vf_vo_vf_vf(o, ry, rx));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(4)), vcast_vi2_i(4));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(4)), vcast_vi2_i(4));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  o = vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1e+7f));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  o = visinf_vo_vf(d);
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  return r;
}

/* (next definition continues past this chunk) */
TYPE2_FUNCATR VECTOR_CC
/* sin(pi*d)/cos(pi*d) pair, faster ~3.5 ULP variant (SLEEF "_U35" convention).
   Same octant reduction as the U05 version, but plain single-precision
   polynomials with no double-float correction terms. */
vfloat2 XSINCOSPIF_U35(vfloat d) {
  vopmask o;
  vfloat u, s, t, rx, ry;
  vfloat2 r;

  u = vmul_vf_vf_vf(d, vcast_vf_f(4));
  vint2 q = vtruncate_vi2_vf(u);
  q = vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vxor_vi2_vi2_vi2(vsrl_vi2_vi2_i(q, 31), vcast_vi2_i(1))), vcast_vi2_i(~1));
  s = vsub_vf_vf_vf(u, vcast_vf_vi2(q));

  t = s;
  s = vmul_vf_vf_vf(s, s);

  /* Sine polynomial in s = t^2, final multiply by t. */
  u = vcast_vf_f(-0.3600925265e-4);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2490088111e-2));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.8074551076e-1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.7853981853e+0));
  rx = vmul_vf_vf_vf(u, t);

  /* Cosine polynomial. */
  u = vcast_vf_f(+0.3539815225e-5);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.3259574005e-3));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1585431583e-1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.3084251285e+0));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(1));
  ry = u;

  /* Octant swap and sign handling, then the same special cases as the U05 variant. */
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0));
  r = vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(o, rx, ry), vsel_vf_vo_vf_vf(o, ry, rx));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(4)), vcast_vi2_i(4));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(4)), vcast_vi2_i(4));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  o = vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1e+7f));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  o = visinf_vo_vf(d);
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  return r;
}

/* modf-style split: x lane = fractional part, y lane = integral part, both
   carrying the sign of x.  |x| >= 2^23 has no fractional bits, so fr is 0. */
TYPE6_FUNCATR VECTOR_CC vfloat2 XMODFF(vfloat x) {
  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));
  fr = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23)), vcast_vf_f(0), fr);
  vfloat2 ret;
  ret = vf2setxy_vf2_vf_vf(vcopysign_vf_vf_vf(fr, x), vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x));
  return ret;
}

#ifdef ENABLE_GNUABI
/* GNU-ABI entry points: compute the sin/cos pair with the internal kernel and
   store each lane through the caller-provided pointers (unaligned stores). */
EXPORT VECTOR_CC void xsincosf(vfloat a, float *ps, float *pc) {
  vfloat2 r = sincosfk(a);
  vstoreu_v_p_vf(ps, vf2getx_vf_vf2(r));
  vstoreu_v_p_vf(pc, vf2gety_vf_vf2(r));
}

EXPORT VECTOR_CC void xsincosf_u1(vfloat a, float *ps, float *pc) {
  vfloat2 r = sincosfk_u1(a);
  vstoreu_v_p_vf(ps, vf2getx_vf_vf2(r));
  vstoreu_v_p_vf(pc, vf2gety_vf_vf2(r));
}

EXPORT VECTOR_CC void xsincospif_u05(vfloat a, float *ps, float *pc) {
  vfloat2 r = sincospifk_u05(a);
  vstoreu_v_p_vf(ps, vf2getx_vf_vf2(r));
  vstoreu_v_p_vf(pc, vf2gety_vf_vf2(r));
}

EXPORT VECTOR_CC void xsincospif_u35(vfloat a, float *ps, float *pc) {
  vfloat2 r = sincospifk_u35(a);
  vstoreu_v_p_vf(ps, vf2getx_vf_vf2(r));
  vstoreu_v_p_vf(pc, vf2gety_vf_vf2(r));
}

EXPORT CONST VECTOR_CC vfloat xmodff(vfloat a, float *iptr) {
  vfloat2 r = modffk(a);
  vstoreu_v_p_vf(iptr, vf2gety_vf_vf2(r));
  return vf2getx_vf_vf2(r);
}
#endif // #ifdef ENABLE_GNUABI
#endif // #if !defined(DETERMINISTIC)

/* tan(d), ~1 ULP variant.  Reduction modulo pi/2 as in the sincos kernels;
   the df pair s is conditionally negated for odd quadrants, a polynomial in
   s^2 is evaluated, and for odd quadrants the reciprocal is taken at the end. */
EXPORT CONST VECTOR_CC vfloat xtanf_u1(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vfloat u, v;
  vfloat2 s, t, x;
  vopmask o;

  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) {
    u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI)));
    q = vrint_vi2_vf(u);
    v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
    s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f)));
    s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f)));
  } else {
    dfi_t dfi = rempif(d);
    q = dfigeti_vi2_dfi(dfi);
    s = dfigetdf_vf2_dfi(dfi);
    o = vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d));
    /* Inf/NaN input: force NaN into both halves of the df pair. */
    s = vf2setx_vf2_vf2_vf(s, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(s)))));
    s = vf2sety_vf2_vf2_vf(s, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2gety_vf_vf2(s)))));
  }

  /* Negate s in odd quadrants (sign mask n), with an MSVC-specific spelling. */
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
  vmask n = vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0)));
#if !defined(_MSC_VER)
  s = vf2setx_vf2_vf2_vf(s, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2getx_vf_vf2(s)), n)));
  s = vf2sety_vf2_vf2_vf(s, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2gety_vf_vf2(s)), n)));
#else
  s.x = vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(s.x), n));
  s.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(s.y), n));
#endif

  t = s;
  s = dfsqu_vf2_vf2(s);
  s = dfnormalize_vf2_vf2(s);

  u = vcast_vf_f(0.00446636462584137916564941f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-8.3920182078145444393158e-05f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0109639242291450500488281f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0212360303848981857299805f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0540687143802642822265625f));

  x = dfadd_vf2_vf_vf(vcast_vf_f(0.133325666189193725585938f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(s)));
  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf2(vcast_vf_f(0.33333361148834228515625f), dfmul_vf2_vf2_vf2(s, x)), s));
  x = dfmul_vf2_vf2_vf2(t, x);

  /* Odd quadrant: tan = -1/tan of the reduced argument, via df reciprocal. */
  x = vsel_vf2_vo_vf2_vf2(o, dfrec_vf2_vf2(x), x);

  u = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));
  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, u); /* tan(-0) = -0 */
  return u;
#else // #if !defined(DETERMINISTIC)
  /* Branch-free variant, as in XSINCOSF_U1 above. */
  vint2 q;
  vfloat u, v;
  vfloat2 s, t, x;
  vopmask o;

  u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI)));
  q = vrint_vi2_vf(u);
  v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
  s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f)));
  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f)));
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f));

  if (!LIKELY(vtestallones_i_vo32(g))) {
    dfi_t dfi = rempif(d);
    t = dfigetdf_vf2_dfi(dfi);
    o = vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d));
    t = vf2setx_vf2_vf2_vf(t, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(t)))));
    t = vf2sety_vf2_vf2_vf(t, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2gety_vf_vf2(t)))));
    q = vsel_vi2_vo_vi2_vi2(g, q, dfigeti_vi2_dfi(dfi));
    s = vsel_vf2_vo_vf2_vf2(g, s, t);
  }

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
  vmask n = vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0)));
  s = vf2setx_vf2_vf2_vf(s, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2getx_vf_vf2(s)), n)));
  s = vf2sety_vf2_vf2_vf(s, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2gety_vf_vf2(s)), n)));

  t = s;
  s = dfsqu_vf2_vf2(s);
  s = dfnormalize_vf2_vf2(s);

  u = vcast_vf_f(0.00446636462584137916564941f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-8.3920182078145444393158e-05f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0109639242291450500488281f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0212360303848981857299805f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0540687143802642822265625f));

  x = dfadd_vf2_vf_vf(vcast_vf_f(0.133325666189193725585938f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(s)));
  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf2(vcast_vf_f(0.33333361148834228515625f), dfmul_vf2_vf2_vf2(s, x)), s));
  x = dfmul_vf2_vf2_vf2(t, x);

  x = vsel_vf2_vo_vf2_vf2(o, dfrec_vf2_vf2(x), x);

  u = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));
  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, u);
  return u;
#endif // #if !defined(DETERMINISTIC)
}

#if !defined(DETERMINISTIC)
/* atan(d), 3.5 ULP class.  q records sign (bit 1) and reciprocal use (bit 0);
   the statement below continues on the next source line. */
EXPORT CONST VECTOR_CC vfloat xatanf(vfloat d) {
  vfloat s, t, u;
  vint2 q;

  q = vsel_vi2_vf_vi2(d, vcast_vi2_i(2));
  s = vabs_vf_vf(d);

  q = vsel_vi2_vf_vf_vi2_vi2(vcast_vf_f(1.0f), s, vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q);
  s =
/* Continuation of xatanf: reduce |d| > 1 via the reciprocal, evaluate the
   odd polynomial with POLY8, then undo the reduction and restore the sign. */
  vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vcast_vf_f(1.0f), s), vrec_vf_vf(s), s);

  t = vmul_vf_vf_vf(s, s);

  vfloat t2 = vmul_vf_vf_vf(t, t), t4 = vmul_vf_vf_vf(t2, t2);
  u = POLY8(t, t2, t4, 0.00282363896258175373077393f, -0.0159569028764963150024414f, 0.0425049886107444763183594f, -0.0748900920152664184570312f, 0.106347933411598205566406f, -0.142027363181114196777344f, 0.199926957488059997558594f, -0.333331018686294555664062f);

  t = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(t, u), s);
  /* q bit 0: the reciprocal path was taken, so atan = pi/2 - atan(1/|d|). */
  t = vsel_vf_vo_vf_vf(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), t), t);
  /* q bit 1: d was negative, flip the sign of the result. */
  t = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(t)));

#if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4)
  t = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vmulsign_vf_vf_vf(vcast_vf_f(1.5874010519681994747517056f), d), t);
#endif

  return t;
}
#endif // #if !defined(DETERMINISTIC)

/* atan2 helper kernel: returns atan(y/x) plus q*(pi/2), where q encodes the
   quadrant derived from the signs/magnitudes of x and y. */
static INLINE CONST VECTOR_CC vfloat atan2kf(vfloat y, vfloat x) {
  vfloat s, t, u;
  vint2 q;
  vopmask p;

  q = vsel_vi2_vf_vi2(x, vcast_vi2_i(-2));
  x = vabs_vf_vf(x);

  q = vsel_vi2_vf_vf_vi2_vi2(x, y, vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q);
  p = vlt_vo_vf_vf(x, y);
  s = vsel_vf_vo_vf_vf(p, vneg_vf_vf(x), y);
  t = vmax_vf_vf_vf(x, y);

  s = vdiv_vf_vf_vf(s, t);
  t = vmul_vf_vf_vf(s, s);

  vfloat t2 = vmul_vf_vf_vf(t, t), t4 = vmul_vf_vf_vf(t2, t2);
  u = POLY8(t, t2, t4, 0.00282363896258175373077393f, -0.0159569028764963150024414f, 0.0425049886107444763183594f, -0.0748900920152664184570312f, 0.106347933411598205566406f, -0.142027363181114196777344f, 0.199926957488059997558594f, -0.333331018686294555664062f);

  t = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(t, u), s);
  t = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f((float)(M_PI/2)), t);

  return t;
}

/* Where d is infinite, return m with d's sign bit OR-ed in; elsewhere 0. */
static INLINE CONST VECTOR_CC vfloat visinf2_vf_vf_vf(vfloat d, vfloat m) {
  return vreinterpret_vf_vm(vand_vm_vo32_vm(visinf_vo_vf(d), vor_vm_vm_vm(vsignbit_vm_vf(d), vreinterpret_vm_vf(m))));
}

#if !defined(DETERMINISTIC)
/* atan2(y, x), 3.5 ULP class, with the usual special cases for x or y being
   zero, infinite, or NaN handled by explicit selects after the kernel. */
EXPORT CONST VECTOR_CC vfloat xatan2f(vfloat y, vfloat x) {
  vfloat r = atan2kf(vabs_vf_vf(y), x);

  r = vmulsign_vf_vf_vf(r, x);
  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), veq_vo_vf_vf(x, vcast_vf_f(0.0f))), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), visinf2_vf_vf_vf(x, vmulsign_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), x))), r);
  r = vsel_vf_vo_vf_vf(visinf_vo_vf(y), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), visinf2_vf_vf_vf(x, vmulsign_vf_vf_vf(vcast_vf_f((float)(M_PI/4)), x))), r);
  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(y, vcast_vf_f(0.0f)), vreinterpret_vf_vm(vand_vm_vo32_vm(vsignbit_vo_vf(x), vreinterpret_vm_vf(vcast_vf_f((float)M_PI)))), r);

  /* NaN in either input propagates (mask OR), and the result takes y's sign. */
  r = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vreinterpret_vm_vf(vmulsign_vf_vf_vf(r, y))));
  return r;
}

/* asin(d), 3.5 ULP class.  For |d| < 0.5 the polynomial is used directly;
   otherwise the identity asin(d) = pi/2 - 2*asin(sqrt((1-|d|)/2)) applies. */
EXPORT CONST VECTOR_CC vfloat xasinf(vfloat d) {
  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f));
  vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d), vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f)));
  vfloat x = vsel_vf_vo_vf_vf(o, vabs_vf_vf(d), vsqrt_vf_vf(x2)), u;

  u = vcast_vf_f(+0.4197454825e-1);
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.2424046025e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.4547423869e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.7495029271e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.1666677296e+0));
  u = vmla_vf_vf_vf_vf(u, vmul_vf_vf_vf(x, x2), x);

  vfloat r = vsel_vf_vo_vf_vf(o, u, vmla_vf_vf_vf_vf(u, vcast_vf_f(-2), vcast_vf_f(M_PIf/2)));
  return vmulsign_vf_vf_vf(r, d);
}

/* acos(d), 3.5 ULP class; same |d| < 0.5 split as xasinf, with a final
   df-corrected pi adjustment for d < -0.5. */
EXPORT CONST VECTOR_CC vfloat xacosf(vfloat d) {
  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f));
  vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d), vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f))), u;
  vfloat x = vsel_vf_vo_vf_vf(o, vabs_vf_vf(d), vsqrt_vf_vf(x2));
  x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1.0f)), vcast_vf_f(0), x);

  u = vcast_vf_f(+0.4197454825e-1);
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.2424046025e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.4547423869e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.7495029271e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.1666677296e+0));
  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(x2, x));

  vfloat y = vsub_vf_vf_vf(vcast_vf_f(3.1415926535897932f/2), vadd_vf_vf_vf(vmulsign_vf_vf_vf(x, d), vmulsign_vf_vf_vf(u, d)));
  x = vadd_vf_vf_vf(x, u);
  vfloat r = vsel_vf_vo_vf_vf(o, y, vmul_vf_vf_vf(x, vcast_vf_f(2)));
  return vsel_vf_vo_vf_vf(vandnot_vo_vo_vo(o, vlt_vo_vf_vf(d, vcast_vf_f(0))), vf2getx_vf_vf2(dfadd_vf2_vf2_vf(vcast_vf2_f_f(3.1415927410125732422f,-8.7422776573475857731e-08f), vneg_vf_vf(r))), r);
}
#endif // #if !defined(DETERMINISTIC)

//

/* High-accuracy atan2 kernel on double-float inputs; returns atan(y/x) plus
   q*(pi/2) as a df pair.  q is built from x's sign and the x<y comparison. */
static INLINE CONST VECTOR_CC vfloat2 atan2kf_u1(vfloat2 y, vfloat2 x) {
  vfloat u;
  vfloat2 s, t;
  vint2 q;
  vopmask p;
  vmask r;

  q = vsel_vi2_vf_vf_vi2_vi2(vf2getx_vf_vf2(x), vcast_vf_f(0), vcast_vi2_i(-2), vcast_vi2_i(0));
  p = vlt_vo_vf_vf(vf2getx_vf_vf2(x), vcast_vf_f(0));
  r = vand_vm_vo32_vm(p, vreinterpret_vm_vf(vcast_vf_f(-0.0)));
  /* Negate both halves of x where x < 0 (sign-bit XOR on the df pair). */
  x = vf2setx_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2getx_vf_vf2(x)), r)));
  x = vf2sety_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2gety_vf_vf2(x)), r)));

  q = vsel_vi2_vf_vf_vi2_vi2(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y), vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q);
  p = vlt_vo_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  s = vsel_vf2_vo_vf2_vf2(p, dfneg_vf2_vf2(x), y);
  t = vsel_vf2_vo_vf2_vf2(p, y, x);

  s = dfdiv_vf2_vf2_vf2(s, t);
  t = dfsqu_vf2_vf2(s);
  t = dfnormalize_vf2_vf2(t);

  u = vcast_vf_f(-0.00176397908944636583328247f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(0.0107900900766253471374512f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(-0.0309564601629972457885742f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(0.0577365085482597351074219f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(-0.0838950723409652709960938f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(0.109463557600975036621094f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(-0.142626821994781494140625f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(0.199983194470405578613281f));

  t = dfmul_vf2_vf2_vf2(t, dfadd_vf2_vf_vf(vcast_vf_f(-0.333332866430282592773438f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(t))));
  t = dfmul_vf2_vf2_vf2(s, dfadd_vf2_vf_vf2(vcast_vf_f(1), t));
  /* Add q quarter-turns using a double-float pi/2 constant. */
  t = dfadd_vf2_vf2_vf2(dfmul_vf2_vf2_vf(vcast_vf2_f_f(1.5707963705062866211f, -4.3711388286737928865e-08f), vcast_vf_vi2(q)), t);

  return t;
}

#if !defined(DETERMINISTIC)
/* atan2(y, x), ~1 ULP variant.  Tiny |x| is prescaled by 2^24 (same scale on
   both inputs, so the ratio is unchanged) to avoid df underflow in the kernel. */
EXPORT CONST VECTOR_CC vfloat xatan2f_u1(vfloat y, vfloat x) {
  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(2.9387372783541830947e-39f)); // nexttowardf((1.0 / FLT_MAX), 1)
  x = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(x, vcast_vf_f(1 << 24)), x);
  y = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(y, vcast_vf_f(1 << 24)), y);

  vfloat2 d = atan2kf_u1(vcast_vf2_vf_vf(vabs_vf_vf(y), vcast_vf_f(0)), vcast_vf2_vf_vf(x, vcast_vf_f(0)));
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d));

  r = vmulsign_vf_vf_vf(r, x);
  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), veq_vo_vf_vf(x, vcast_vf_f(0))), vsub_vf_vf_vf(vcast_vf_f(M_PI/2), visinf2_vf_vf_vf(x, vmulsign_vf_vf_vf(vcast_vf_f(M_PI/2), x))), r);
  r = vsel_vf_vo_vf_vf(visinf_vo_vf(y), vsub_vf_vf_vf(vcast_vf_f(M_PI/2), visinf2_vf_vf_vf(x, vmulsign_vf_vf_vf(vcast_vf_f(M_PI/4), x))), r);
  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(y, vcast_vf_f(0.0f)), vreinterpret_vf_vm(vand_vm_vo32_vm(vsignbit_vo_vf(x), vreinterpret_vm_vf(vcast_vf_f((float)M_PI)))), r);

  r = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vreinterpret_vm_vf(vmulsign_vf_vf_vf(r, y))));
  return r;
}

/* asin(d), ~1 ULP variant; df sqrt path for |d| >= 0.5.  The statement below
   continues on the next source line. */
EXPORT CONST VECTOR_CC vfloat xasinf_u1(vfloat d) {
  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f));
  vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d),
/* Continuation of xasinf_u1. */
  vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f))), u;
  vfloat2 x = vsel_vf2_vo_vf2_vf2(o, vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), dfsqrt_vf2_vf(x2));
  x = vsel_vf2_vo_vf2_vf2(veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1.0f)), vcast_vf2_f_f(0, 0), x);

  u = vcast_vf_f(+0.4197454825e-1);
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.2424046025e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.4547423869e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.7495029271e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.1666677296e+0));
  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)));

  /* pi/4 held as a double-float constant (hi, lo) for the reduction identity. */
  vfloat2 y = dfsub_vf2_vf2_vf(dfsub_vf2_vf2_vf2(vcast_vf2_f_f(3.1415927410125732422f/4,-8.7422776573475857731e-08f/4), x), u);

  vfloat r = vsel_vf_vo_vf_vf(o, vadd_vf_vf_vf(u, vf2getx_vf_vf2(x)), vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(y), vf2gety_vf_vf2(y)), vcast_vf_f(2)));
  return vmulsign_vf_vf_vf(r, d);
}

/* acos(d), ~1 ULP variant; mirrors xasinf_u1's split with a df pi/2 constant
   and a final df pi adjustment for negative d outside (-0.5, 0.5). */
EXPORT CONST VECTOR_CC vfloat xacosf_u1(vfloat d) {
  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f));
  vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d), vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f))), u;
  vfloat2 x = vsel_vf2_vo_vf2_vf2(o, vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), dfsqrt_vf2_vf(x2));
  x = vsel_vf2_vo_vf2_vf2(veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1.0f)), vcast_vf2_f_f(0, 0), x);

  u = vcast_vf_f(+0.4197454825e-1);
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.2424046025e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.4547423869e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.7495029271e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.1666677296e+0));
  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)));

  vfloat2 y = dfsub_vf2_vf2_vf2(vcast_vf2_f_f(3.1415927410125732422f/2, -8.7422776573475857731e-08f/2), dfadd_vf2_vf_vf(vmulsign_vf_vf_vf(vf2getx_vf_vf2(x), d), vmulsign_vf_vf_vf(u, d)));
  x = dfadd_vf2_vf2_vf(x, u);

  y = vsel_vf2_vo_vf2_vf2(o, y, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));

  y = vsel_vf2_vo_vf2_vf2(vandnot_vo_vo_vo(o, vlt_vo_vf_vf(d, vcast_vf_f(0))), dfsub_vf2_vf2_vf2(vcast_vf2_f_f(3.1415927410125732422f, -8.7422776573475857731e-08f), y), y);

  return vadd_vf_vf_vf(vf2getx_vf_vf2(y), vf2gety_vf_vf2(y));
}

/* atan(d), ~1 ULP variant: call the df atan2 kernel with x = 1. */
EXPORT CONST VECTOR_CC vfloat xatanf_u1(vfloat d) {
  vfloat2 d2 = atan2kf_u1(vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), vcast_vf2_f_f(1, 0));
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(d2), vf2gety_vf_vf2(d2));
  r = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vcast_vf_f(1.570796326794896557998982), r); /* atan(+-inf) = +-pi/2 */
  return vmulsign_vf_vf_vf(r, d);
}
#endif // #if !defined(DETERMINISTIC)

//

#if !defined(DETERMINISTIC)
/* log(d), 3.5 ULP class.  Non-AVX512 path: subnormals are prescaled by 2^64
   and the exponent is compensated; the mantissa m is normalized around 1 via
   ilogb/ldexp, and log is computed from x = (m-1)/(m+1) with an odd polynomial.
   AVX512 path uses vgetexp/vgetmant and vfixup for the special cases. */
EXPORT CONST VECTOR_CC vfloat xlogf(vfloat d) {
  vfloat x, x2, t, m;

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));
  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);
  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));
  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
#else
  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
  m = vgetmant_vf_vf(d);
#endif

  x = vdiv_vf_vf_vf(vsub_vf_vf_vf(m, vcast_vf_f(1.0f)), vadd_vf_vf_vf(vcast_vf_f(1.0f), m));
  x2 = vmul_vf_vf_vf(x, x);

  t = vcast_vf_f(0.2392828464508056640625f);
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.28518211841583251953125f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.400005877017974853515625f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.666666686534881591796875f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(2.0f));

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  x = vmla_vf_vf_vf_vf(x, t, vmul_vf_vf_vf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
  /* Special cases: log(+inf)=+inf, log(neg/NaN)=NaN, log(0)=-inf. */
  x = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITYf), x);
  x = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(SLEEF_NANf), x);
  x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-SLEEF_INFINITYf), x);
#else
  x = vmla_vf_vf_vf_vf(x, t, vmul_vf_vf_vf(vcast_vf_f(0.693147180559945286226764f), e));
  x = vfixup_vf_vf_vf_vi2_i(x, d, vcast_vi2_i((5 << (5*4))), 0);
#endif

  return x;
}
#endif // #if !defined(DETERMINISTIC)

#if !defined(DETERMINISTIC)
/* exp(d), with reduction d = q*ln2 + s (split constant L2Uf/L2Lf), a degree-6
   polynomial on s, and reconstruction by scaling with 2^q.  Results clamp to 0
   below -104 and to +inf above 100 (beyond float range). */
EXPORT CONST VECTOR_CC vfloat xexpf(vfloat d) {
  vint2 q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(R_LN2f)));
  vfloat s, u;

  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf), d);
  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf), s);

  u = vcast_vf_f(0.000198527617612853646278381);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00139304355252534151077271));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833336077630519866943359));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416664853692054748535156));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.166666671633720397949219));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.5));

  u = vadd_vf_vf_vf(vcast_vf_f(1.0f), vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, s), u, s));
  u = vldexp2_vf_vf_vi2(u, q);

  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-104)), vreinterpret_vm_vf(u)));
  u = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vcast_vf_f(100), d), vcast_vf_f(SLEEF_INFINITYf), u);

  return u;
}
#endif // #if !defined(DETERMINISTIC)

/* expm1 kernel: like xexpf but returns exp(d)-1; for q == 0 the polynomial is
   returned directly to keep precision near zero. */
static INLINE CONST VECTOR_CC vfloat expm1fk(vfloat d) {
  vint2 q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(R_LN2f)));
  vfloat s, u;

  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf), d);
  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf), s);

  vfloat s2 = vmul_vf_vf_vf(s, s), s4 = vmul_vf_vf_vf(s2, s2);
  u = POLY6(s, s2, s4, 0.000198527617612853646278381, 0.00139304355252534151077271, 0.00833336077630519866943359, 0.0416664853692054748535156, 0.166666671633720397949219, 0.5);

  u = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, s), u, s);

  u = vsel_vf_vo_vf_vf(veq_vo_vi2_vi2(q, vcast_vi2_i(0)), u, vsub_vf_vf_vf(vldexp2_vf_vf_vi2(vadd_vf_vf_vf(u, vcast_vf_f(1)), q), vcast_vf_f(1)));

  return u;
}

#if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4)
/* sqrt(d), 3.5 ULP class, NEON32: no hardware sqrt, so use the reciprocal
   square-root estimate plus one Newton step, with manual exponent handling. */
EXPORT CONST VECTOR_CC vfloat xsqrtf_u35(vfloat d) {
  vfloat e = vreinterpret_vf_vi2(vadd_vi2_vi2_vi2(vcast_vi2_i(0x20000000), vand_vi2_vi2_vi2(vcast_vi2_i(0x7f000000), vsrl_vi2_vi2_i(vreinterpret_vi2_vf(d), 1))));
  vfloat m = vreinterpret_vf_vi2(vadd_vi2_vi2_vi2(vcast_vi2_i(0x3f000000), vand_vi2_vi2_vi2(vcast_vi2_i(0x01ffffff), vreinterpret_vi2_vf(d))));
  float32x4_t x = vrsqrteq_f32(m);
  x = vmulq_f32(x, vrsqrtsq_f32(m, vmulq_f32(x, x)));
  float32x4_t u = vmulq_f32(x, m);
  u = vmlaq_f32(u, vmlsq_f32(m, u, u), vmulq_f32(x, vdupq_n_f32(0.5)));
  e = vreinterpret_vf_vm(vandnot_vm_vo32_vm(veq_vo_vf_vf(d, vcast_vf_f(0)), vreinterpret_vm_vf(e)));
  u = vmul_vf_vf_vf(e, u);

  u = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITYf), u);
  u = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visnan_vo_vf(d), vlt_vo_vf_vf(d, vcast_vf_f(0))), vreinterpret_vm_vf(u)));
  u = vmulsign_vf_vf_vf(u, d);

  return u;
}
#elif defined(ENABLE_VECEXT)
/* Vector-extension build: native sqrt plus explicit -0 and +inf handling. */
EXPORT CONST VECTOR_CC vfloat xsqrtf_u35(vfloat d) {
  vfloat q = vsqrt_vf_vf(d);
  q = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0), q);
  return vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITYf), q);
}
#else
/* Default: the target's vector sqrt already meets the accuracy bound. */
EXPORT CONST VECTOR_CC vfloat xsqrtf_u35(vfloat d) { return vsqrt_vf_vf(d); }
#endif

#if !defined(DETERMINISTIC)
/* cbrt(d), 3.5 ULP class.  Exponent is split off (biased by 6144 so the
   division by 3 truncates consistently); re = e mod 3 picks a 2^(1/3) or
   4^(1/3) factor; a polynomial estimate of d^(-2/3) is refined and recombined.
   The trailing "q =" continues on the next source line. */
EXPORT CONST VECTOR_CC vfloat xcbrtf(vfloat d) {
  vfloat x, y, q = vcast_vf_f(1.0), t;
  vint2 e, qu, re;

#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)
  vfloat s = d;
#endif
  e = vadd_vi2_vi2_vi2(vilogbk_vi2_vf(vabs_vf_vf(d)), vcast_vi2_i(1));
  d = vldexp2_vf_vf_vi2(d, vneg_vi2_vi2(e));

  t = vadd_vf_vf_vf(vcast_vf_vi2(e), vcast_vf_f(6144));
  qu = vtruncate_vi2_vf(vmul_vf_vf_vf(t, vcast_vf_f(1.0f/3.0f)));
  re = vtruncate_vi2_vf(vsub_vf_vf_vf(t, vmul_vf_vf_vf(vcast_vf_vi2(qu), vcast_vf_f(3))));

  q = vsel_vf_vo_vf_vf(veq_vo_vi2_vi2(re, vcast_vi2_i(1)), vcast_vf_f(1.2599210498948731647672106f), q);
  q =
/* Continuation of xcbrtf. */
  vsel_vf_vo_vf_vf(veq_vo_vi2_vi2(re, vcast_vi2_i(2)), vcast_vf_f(1.5874010519681994747517056f), q);
  q = vldexp2_vf_vf_vi2(q, vsub_vi2_vi2_vi2(qu, vcast_vi2_i(2048)));

  q = vmulsign_vf_vf_vf(q, d);
  d = vabs_vf_vf(d);

  /* Polynomial initial guess for the cube root, then one refinement step. */
  x = vcast_vf_f(-0.601564466953277587890625f);
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.8208892345428466796875f));
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-5.532182216644287109375f));
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(5.898262500762939453125f));
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-3.8095417022705078125f));
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.2241256237030029296875f));

  y = vmul_vf_vf_vf(vmul_vf_vf_vf(d, x), x);
  y = vmul_vf_vf_vf(vsub_vf_vf_vf(y, vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(2.0f / 3.0f), y), vmla_vf_vf_vf_vf(y, x, vcast_vf_f(-1.0f)))), q);

#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)
  y = vsel_vf_vo_vf_vf(visinf_vo_vf(s), vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), s), y);
  y = vsel_vf_vo_vf_vf(veq_vo_vf_vf(s, vcast_vf_f(0)), vmulsign_vf_vf_vf(vcast_vf_f(0), s), y);
#endif

  return y;
}
#endif // #if !defined(DETERMINISTIC)

#if !defined(DETERMINISTIC)
/* cbrt(d), ~1 ULP variant: same exponent splitting as xcbrtf, but the
   2^(1/3)/4^(1/3) factor and the refinement are carried in double-float. */
EXPORT CONST VECTOR_CC vfloat xcbrtf_u1(vfloat d) {
  vfloat x, y, z, t;
  vfloat2 q2 = vcast_vf2_f_f(1, 0), u, v;
  vint2 e, qu, re;

#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)
  vfloat s = d;
#endif
  e = vadd_vi2_vi2_vi2(vilogbk_vi2_vf(vabs_vf_vf(d)), vcast_vi2_i(1));
  d = vldexp2_vf_vf_vi2(d, vneg_vi2_vi2(e));

  t = vadd_vf_vf_vf(vcast_vf_vi2(e), vcast_vf_f(6144));
  qu = vtruncate_vi2_vf(vmul_vf_vf_vf(t, vcast_vf_f(1.0/3.0)));
  re = vtruncate_vi2_vf(vsub_vf_vf_vf(t, vmul_vf_vf_vf(vcast_vf_vi2(qu), vcast_vf_f(3))));

  q2 = vsel_vf2_vo_vf2_vf2(veq_vo_vi2_vi2(re, vcast_vi2_i(1)), vcast_vf2_f_f(1.2599210739135742188f, -2.4018701694217270415e-08), q2);
  q2 = vsel_vf2_vo_vf2_vf2(veq_vo_vi2_vi2(re, vcast_vi2_i(2)), vcast_vf2_f_f(1.5874010324478149414f, 1.9520385308169352356e-08), q2);

  q2 = vf2setx_vf2_vf2_vf(q2, vmulsign_vf_vf_vf(vf2getx_vf_vf2(q2), d));
  q2 = vf2sety_vf2_vf2_vf(q2, vmulsign_vf_vf_vf(vf2gety_vf_vf2(q2), d));
  d = vabs_vf_vf(d);

  x = vcast_vf_f(-0.601564466953277587890625f);
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.8208892345428466796875f));
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-5.532182216644287109375f));
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(5.898262500762939453125f));
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-3.8095417022705078125f));
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.2241256237030029296875f));

  y = vmul_vf_vf_vf(x, x);
  y = vmul_vf_vf_vf(y, y);
  x = vsub_vf_vf_vf(x, vmul_vf_vf_vf(vmlanp_vf_vf_vf_vf(d, y, x), vcast_vf_f(-1.0 / 3.0)));

  z = x;

  /* Double-float refinement of the estimate. */
  u = dfmul_vf2_vf_vf(x, x);
  u = dfmul_vf2_vf2_vf2(u, u);
  u = dfmul_vf2_vf2_vf(u, d);
  u = dfadd2_vf2_vf2_vf(u, vneg_vf_vf(x));
  y = vadd_vf_vf_vf(vf2getx_vf_vf2(u), vf2gety_vf_vf2(u));

  y = vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(-2.0 / 3.0), y), z);
  v = dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(z, z), y);
  v = dfmul_vf2_vf2_vf(v, d);
  v = dfmul_vf2_vf2_vf2(v, q2);
  z = vldexp2_vf_vf_vi2(vadd_vf_vf_vf(vf2getx_vf_vf2(v), vf2gety_vf_vf2(v)), vsub_vi2_vi2_vi2(qu, vcast_vi2_i(2048)));

  z = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), vf2getx_vf_vf2(q2)), z);
  z = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vreinterpret_vf_vm(vsignbit_vm_vf(vf2getx_vf_vf2(q2))), z);

#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)
  z = vsel_vf_vo_vf_vf(visinf_vo_vf(s), vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), s), z);
  z = vsel_vf_vo_vf_vf(veq_vo_vf_vf(s, vcast_vf_f(0)), vmulsign_vf_vf_vf(vcast_vf_f(0), s), z);
#endif

  return z;
}
#endif // #if !defined(DETERMINISTIC)

/* log kernel returning a double-float result; same mantissa/exponent split as
   xlogf but every accumulation is done in df arithmetic. */
static INLINE CONST VECTOR_CC vfloat2 logkf(vfloat d) {
  vfloat2 x, x2;
  vfloat t, m;

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));
  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);
  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));
  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
#else
  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
  m = vgetmant_vf_vf(d);
#endif

  x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(-1), m), dfadd2_vf2_vf_vf(vcast_vf_f(1), m));
  x2 = dfsqu_vf2_vf2(x);

  t = vcast_vf_f(0.240320354700088500976562);
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(x2), vcast_vf_f(0.285112679004669189453125));
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(x2), vcast_vf_f(0.400007992982864379882812));
  vfloat2 c = vcast_vf2_f_f(0.66666662693023681640625f, 3.69183861259614332084311e-09f);

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), vcast_vf_vi2(e));
#else
  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), e);
#endif

  s = dfadd_vf2_vf2_vf2(s, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));
  s = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(x2, x), dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(x2, t), c)));
  return s;
}

/* Plain single-precision log kernel without special-case handling; used where
   the caller deals with range/validity itself. */
static INLINE CONST VECTOR_CC vfloat logk3f(vfloat d) {
  vfloat x, x2, t, m;

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));
  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);
  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));
  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
#else
  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
  m = vgetmant_vf_vf(d);
#endif

  x = vdiv_vf_vf_vf(vsub_vf_vf_vf(m, vcast_vf_f(1.0f)), vadd_vf_vf_vf(vcast_vf_f(1.0f), m));
  x2 = vmul_vf_vf_vf(x, x);

  t = vcast_vf_f(0.2392828464508056640625f);
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.28518211841583251953125f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.400005877017974853515625f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.666666686534881591796875f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(2.0f));

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  x = vmla_vf_vf_vf_vf(x, t, vmul_vf_vf_vf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
#else
  x = vmla_vf_vf_vf_vf(x, t, vmul_vf_vf_vf(vcast_vf_f(0.693147180559945286226764f), e));
#endif

  return x;
}

#if !defined(DETERMINISTIC)
/* log(d), ~1 ULP variant: df accumulation as in logkf, shorter polynomial,
   and either explicit selects or vfixup (AVX512) for the special cases. */
EXPORT CONST VECTOR_CC vfloat xlogf_u1(vfloat d) {
  vfloat2 x;
  vfloat t, m, x2;

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));
  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);
  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));
  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), vcast_vf_vi2(e));
#else
  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
  m = vgetmant_vf_vf(d);
  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), e);
#endif

  x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(-1), m), dfadd2_vf2_vf_vf(vcast_vf_f(1), m));
  x2 = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x));

  t = vcast_vf_f(+0.3027294874e+0f);
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.3996108174e+0f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.6666694880e+0f));

  s = dfadd_vf2_vf2_vf2(s, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));
  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)), t));

  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(s), vf2gety_vf_vf2(s));

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  r = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITYf), r);
  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(SLEEF_NANf), r);
  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-SLEEF_INFINITYf), r);
#else
  r = vfixup_vf_vf_vf_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);
#endif

  return r;
}
#endif // #if !defined(DETERMINISTIC)

/* exp kernel on a double-float argument; the result underflows to 0 (via mask
   clear) when the hi part is below -104. */
static INLINE CONST VECTOR_CC vfloat expkf(vfloat2 d) {
  vfloat u = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(R_LN2f));
  vint2 q = vrint_vi2_vf(u);
  vfloat2 s, t;

  s = dfadd2_vf2_vf2_vf(d, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf)));
  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf)));

  s = dfnormalize_vf2_vf2(s);

  u = vcast_vf_f(0.00136324646882712841033936f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00836596917361021041870117f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0416710823774337768554688f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.166665524244308471679688f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.499999850988388061523438f));

  t = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfsqu_vf2_vf2(s), u));

  t = dfadd_vf2_vf_vf2(vcast_vf_f(1), t);
  u = vadd_vf_vf_vf(vf2getx_vf_vf2(t), vf2gety_vf_vf2(t));
  u = vldexp_vf_vf_vi2(u, q);

  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(vf2getx_vf_vf2(d), vcast_vf_f(-104)), vreinterpret_vm_vf(u)));

  return u;
}

/* Plain exp kernel without the over/underflow clamps of xexpf; the body is
   cut off at the end of this chunk and continues below it. */
static INLINE CONST VECTOR_CC vfloat expk3f(vfloat d) {
  vint2 q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(R_LN2f)));
  vfloat s, u;

  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf), d);
  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf), s);

  u = vcast_vf_f(0.000198527617612853646278381);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00139304355252534151077271));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833336077630519866943359));
  u = vmla_vf_vf_vf_vf(u, s,
vcast_vf_f(0.0416664853692054748535156));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.166666671633720397949219));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.5));
  u = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, s), u, vadd_vf_vf_vf(s, vcast_vf_f(1.0f)));
  u = vldexp2_vf_vf_vi2(u, q);
  // Flush to zero for arguments below -104 (underflow).
  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-104)), vreinterpret_vm_vf(u)));
  return u;
}

#if !defined(DETERMINISTIC)
// xpowf: vectorized powf, computed as expkf(logkf(|x|) * y) with extensive
// special-case patching: integer/odd y detection (for negative x sign rules),
// y == +-inf, x == 0 or +-inf, NaN propagation, and pow(x,0) == pow(1,y) == 1.
EXPORT CONST VECTOR_CC vfloat xpowf(vfloat x, vfloat y) {
#if 1
  // |y| >= 2^24 is always an even integer in float, hence treated as integer.
  vopmask yisint = vor_vo_vo_vo(veq_vo_vf_vf(vtruncate_vf_vf(y), y), vgt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 24)));
  vopmask yisodd = vand_vo_vo_vo(vand_vo_vo_vo(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vtruncate_vi2_vf(y), vcast_vi2_i(1)), vcast_vi2_i(1)), yisint), vlt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 24)));
#if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4)
  // NEON32 float->int truncation of inf is not reliable; clear "odd" for infinite y.
  yisodd = vandnot_vm_vo32_vm(visinf_vo_vf(y), yisodd);
#endif
  vfloat result = expkf(dfmul_vf2_vf2_vf(logkf(vabs_vf_vf(x)), y));
  result = vsel_vf_vo_vf_vf(visnan_vo_vf(result), vcast_vf_f(SLEEF_INFINITYf), result);
  // Sign/NaN selection for x <= 0: odd integer y flips sign; non-integer y gives NaN.
  result = vmul_vf_vf_vf(result, vsel_vf_vo_vf_vf(vgt_vo_vf_vf(x, vcast_vf_f(0)), vcast_vf_f(1), vsel_vf_vo_vf_vf(yisint, vsel_vf_vo_vf_vf(yisodd, vcast_vf_f(-1.0f), vcast_vf_f(1)), vcast_vf_f(SLEEF_NANf))));
  // y == +-inf: result is 0, 1 or inf depending on whether |x| is <1, ==1 or >1.
  vfloat efx = vmulsign_vf_vf_vf(vsub_vf_vf_vf(vabs_vf_vf(x), vcast_vf_f(1)), y);
  result = vsel_vf_vo_vf_vf(visinf_vo_vf(y), vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(efx, vcast_vf_f(0.0f)), vreinterpret_vm_vf(vsel_vf_vo_vf_vf(veq_vo_vf_vf(efx, vcast_vf_f(0.0f)), vcast_vf_f(1.0f), vcast_vf_f(SLEEF_INFINITYf))))), result);
  // x == +-inf or x == 0: 0 or inf with the sign dictated by odd-integer y.
  result = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), veq_vo_vf_vf(x, vcast_vf_f(0))), vmul_vf_vf_vf(vsel_vf_vo_vf_vf(yisodd, vsign_vf_vf(x), vcast_vf_f(1)), vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), vneg_vf_vf(y), y), vcast_vf_f(0)), vreinterpret_vm_vf(vcast_vf_f(SLEEF_INFINITYf))))), result);
  result = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vreinterpret_vm_vf(result)));
  result = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(y, vcast_vf_f(0)), veq_vo_vf_vf(x, vcast_vf_f(1))), vcast_vf_f(1), result);
  return result;
#else
  return expkf(dfmul_vf2_vf2_vf(logkf(x), y));
#endif
}

// xfastpowf_u3500: fast, lower-accuracy powf built on the single-width
// kernels logk3f/expk3f (the u3500 suffix follows SLEEF's ulp naming).
// Handles only the cheap special cases: sign for odd-integer y, x==0, y==0.
EXPORT CONST VECTOR_CC vfloat xfastpowf_u3500(vfloat x, vfloat y) {
  vfloat result = expk3f(vmul_vf_vf_vf(logk3f(vabs_vf_vf(x)), y));
  vopmask yisint = vor_vo_vo_vo(veq_vo_vf_vf(vtruncate_vf_vf(y), y), vgt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 24)));
  vopmask yisodd = vand_vo_vo_vo(vand_vo_vo_vo(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vtruncate_vi2_vf(y), vcast_vi2_i(1)), vcast_vi2_i(1)), yisint), vlt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 24)));
  result = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vsignbit_vo_vf(x), yisodd), vneg_vf_vf(result), result);
  result = vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), vcast_vf_f(0), result);
  result = vsel_vf_vo_vf_vf(veq_vo_vf_vf(y, vcast_vf_f(0)), vcast_vf_f(1), result);
  return result;
}
#endif // #if !defined(DETERMINISTIC)

// expk2f: exp kernel taking and returning double-float, used by the
// hyperbolic functions and xexpm1f below. Same q*ln2 reduction as expkf, but
// both components of the result are scaled and underflow-masked.
static INLINE CONST VECTOR_CC vfloat2 expk2f(vfloat2 d) {
  vfloat u = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(R_LN2f));
  vint2 q = vrint_vi2_vf(u);
  vfloat2 s, t;
  s = dfadd2_vf2_vf2_vf(d, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf)));
  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf)));
  u = vcast_vf_f(+0.1980960224e-3f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(+0.1394256484e-2f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(+0.8333456703e-2f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(+0.4166637361e-1f));
  t = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(s, u), vcast_vf_f(+0.166666659414234244790680580464e+0f));
  t = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf2(s, t), vcast_vf_f(0.5));
  t = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf2(dfsqu_vf2_vf2(s), t));
  t = dfadd_vf2_vf_vf2(vcast_vf_f(1), t);
  t = vf2setx_vf2_vf2_vf(t, vldexp2_vf_vf_vi2(vf2getx_vf_vf2(t), q));
  t = vf2sety_vf2_vf2_vf(t, vldexp2_vf_vf_vi2(vf2gety_vf_vf2(t), q));
  t = vf2setx_vf2_vf2_vf(t, vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(vf2getx_vf_vf2(d), vcast_vf_f(-104)), vreinterpret_vm_vf(vf2getx_vf_vf2(t)))));
  t = vf2sety_vf2_vf2_vf(t, vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(vf2getx_vf_vf2(d), vcast_vf_f(-104)), vreinterpret_vm_vf(vf2gety_vf_vf2(t)))));
  return t;
}

#if !defined(DETERMINISTIC)
// xsinhf: sinh via (e - 1/e)/2 with e = exp(|x|) in double-float; overflows
// to +inf past |x| > 89, then the input sign is restored and NaN propagated.
EXPORT CONST VECTOR_CC vfloat xsinhf(vfloat x) {
  vfloat y = vabs_vf_vf(x);
  vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0)));
  d = dfsub_vf2_vf2_vf2(d, dfrec_vf2_vf2(d));
  y = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(0.5));
  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(89)), visnan_vo_vf(y)), vcast_vf_f(SLEEF_INFINITYf), y);
  y = vmulsign_vf_vf_vf(y, x);
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  return y;
}

// xcoshf: cosh via (e + 1/e)/2; same overflow threshold as xsinhf, no sign
// restoration since cosh is even.
EXPORT CONST VECTOR_CC vfloat xcoshf(vfloat x) {
  vfloat y = vabs_vf_vf(x);
  vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0)));
  d = dfadd_vf2_vf2_vf2(d, dfrec_vf2_vf2(d));
  y = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(0.5));
  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(89)), visnan_vo_vf(y)), vcast_vf_f(SLEEF_INFINITYf), y);
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  return y;
}

// xtanhf: tanh via (e - 1/e)/(e + 1/e); saturates to +-1 past |x| > 8.664...
// (where float tanh is exactly 1).
EXPORT CONST VECTOR_CC vfloat xtanhf(vfloat x) {
  vfloat y = vabs_vf_vf(x);
  vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0)));
  vfloat2 e = dfrec_vf2_vf2(d);
  d = dfdiv_vf2_vf2_vf2(dfadd_vf2_vf2_vf2(d, dfneg_vf2_vf2(e)), dfadd_vf2_vf2_vf2(d, e));
  y = vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d));
  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(8.664339742f)), visnan_vo_vf(y)), vcast_vf_f(1.0f), y);
  y = vmulsign_vf_vf_vf(y, x);
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  return y;
}

// xsinhf_u35: faster/looser sinh using the expm1fk kernel:
// with e = expm1(|x|), sinh = 0.5 * e * (e+2)/(e+1).
EXPORT CONST VECTOR_CC vfloat xsinhf_u35(vfloat x) {
  vfloat e = expm1fk(vabs_vf_vf(x));
  vfloat y = vdiv_vf_vf_vf(vadd_vf_vf_vf(e, vcast_vf_f(2)), vadd_vf_vf_vf(e, vcast_vf_f(1)));
  y = vmul_vf_vf_vf(y, vmul_vf_vf_vf(vcast_vf_f(0.5f), e));
  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(88)), visnan_vo_vf(y)), vcast_vf_f(SLEEF_INFINITYf), y);
  y = vmulsign_vf_vf_vf(y, x);
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  return y;
}

// xcoshf_u35: faster/looser cosh: 0.5*e + 0.5/e with e = exp(|x|).
EXPORT CONST VECTOR_CC vfloat xcoshf_u35(vfloat x) {
  vfloat e = xexpf(vabs_vf_vf(x));
  vfloat y = vmla_vf_vf_vf_vf(vcast_vf_f(0.5f), e, vdiv_vf_vf_vf(vcast_vf_f(0.5), e));
  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(88)), visnan_vo_vf(y)), vcast_vf_f(SLEEF_INFINITYf), y);
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  return y;
}

// xtanhf_u35: faster/looser tanh: d/(d+2) with d = expm1(2|x|).
EXPORT CONST VECTOR_CC vfloat xtanhf_u35(vfloat x) {
  vfloat d = expm1fk(vmul_vf_vf_vf(vcast_vf_f(2), vabs_vf_vf(x)));
  vfloat y = vdiv_vf_vf_vf(d, vadd_vf_vf_vf(vcast_vf_f(2), d));
  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(8.664339742f)), visnan_vo_vf(y)), vcast_vf_f(1.0f), y);
  y = vmulsign_vf_vf_vf(y, x);
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  return y;
}
#endif // #if !defined(DETERMINISTIC)

// logk2f: log kernel with a double-float argument AND result; used by the
// inverse hyperbolic functions (xasinhf/xacoshf/xatanhf) below.
static INLINE CONST VECTOR_CC vfloat2 logk2f(vfloat2 d) {
  vfloat2 x, x2, m, s;
  vfloat t;
  vint2 e;
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  e = vilogbk_vi2_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(d), vcast_vf_f(1.0f/0.75f)));
#else
  e = vrint_vi2_vf(vgetexp_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(d), vcast_vf_f(1.0f/0.75f))));
#endif
  m = dfscale_vf2_vf2_vf(d, vpow2i_vf_vi2(vneg_vi2_vi2(e)));
  x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(m, vcast_vf_f(-1)), dfadd2_vf2_vf2_vf(m, vcast_vf_f(1)));
  x2 = dfsqu_vf2_vf2(x);
  t = vcast_vf_f(0.2392828464508056640625f);
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(x2),
vcast_vf_f(0.28518211841583251953125f));
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(x2), vcast_vf_f(0.400005877017974853515625f));
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(x2), vcast_vf_f(0.666666686534881591796875f));
  // s = e*ln(2) (double-float) + 2x + x^3 * t
  s = dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(0.69314718246459960938f), vcast_vf_f(-1.904654323148236017e-09f)), vcast_vf_vi2(e));
  s = dfadd_vf2_vf2_vf2(s, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));
  s = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfmul_vf2_vf2_vf2(x2, x), t));
  return s;
}

#if !defined(DETERMINISTIC)
// xasinhf: asinh(x) = log(x + sqrt(x^2 + 1)), computed in double-float via
// logk2f. For |x| > 1 the reciprocal formulation is used for accuracy.
// Overflows to +-inf past sqrt(FLT_MAX); preserves -0 and propagates NaN.
EXPORT CONST VECTOR_CC vfloat xasinhf(vfloat x) {
  vfloat y = vabs_vf_vf(x);
  vopmask o = vgt_vo_vf_vf(y, vcast_vf_f(1));
  vfloat2 d;
  d = vsel_vf2_vo_vf2_vf2(o, dfrec_vf2_vf(x), vcast_vf2_vf_vf(y, vcast_vf_f(0)));
  d = dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfsqu_vf2_vf2(d), vcast_vf_f(1)));
  d = vsel_vf2_vo_vf2_vf2(o, dfmul_vf2_vf2_vf(d, y), d);
  d = logk2f(dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf(d, x)));
  y = vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d));
  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(SQRT_FLT_MAX)), visnan_vo_vf(y)), vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), x), y);
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  y = vsel_vf_vo_vf_vf(visnegzero_vo_vf(x), vcast_vf_f(-0.0), y);
  return y;
}

// xacoshf: acosh(x) = log(sqrt(x+1)*sqrt(x-1) + x). Returns 0 at x==1 and
// NaN (all-ones mask) for x < 1; overflows to +inf past sqrt(FLT_MAX).
EXPORT CONST VECTOR_CC vfloat xacoshf(vfloat x) {
  vfloat2 d = logk2f(dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfsqrt_vf2_vf2(dfadd2_vf2_vf_vf(x, vcast_vf_f(1))), dfsqrt_vf2_vf2(dfadd2_vf2_vf_vf(x, vcast_vf_f(-1)))), x));
  vfloat y = vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d));
  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(SQRT_FLT_MAX)), visnan_vo_vf(y)), vcast_vf_f(SLEEF_INFINITYf), y);
  y = vreinterpret_vf_vm(vandnot_vm_vo32_vm(veq_vo_vf_vf(x, vcast_vf_f(1.0f)), vreinterpret_vm_vf(y)));
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(vlt_vo_vf_vf(x, vcast_vf_f(1.0f)), vreinterpret_vm_vf(y)));
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  return y;
}

// xatanhf: atanh(x) = 0.5 * log((1+|x|)/(1-|x|)); +-inf at |x|==1, NaN for
// |x|>1 and for infinite x; sign of x restored at the end.
EXPORT CONST VECTOR_CC vfloat xatanhf(vfloat x) {
  vfloat y = vabs_vf_vf(x);
  vfloat2 d = logk2f(dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(1), y), dfadd2_vf2_vf_vf(vcast_vf_f(1), vneg_vf_vf(y))));
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(vgt_vo_vf_vf(y, vcast_vf_f(1.0)), vreinterpret_vm_vf(vsel_vf_vo_vf_vf(veq_vo_vf_vf(y, vcast_vf_f(1.0)), vcast_vf_f(SLEEF_INFINITYf), vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(0.5))))));
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(x), visnan_vo_vf(y)), vreinterpret_vm_vf(y)));
  y = vmulsign_vf_vf_vf(y, x);
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  return y;
}
#endif // #if !defined(DETERMINISTIC)

#if !defined(DETERMINISTIC)
// xexp2f: 2^d. Reduction d = q + s with q = round(d), |s| <= 0.5, polynomial
// in s for 2^s, then scale by 2^q. Overflows to +inf at d >= 128, flushes to
// zero below d < -150.
EXPORT CONST VECTOR_CC vfloat xexp2f(vfloat d) {
  vfloat u = vrint_vf_vf(d), s;
  vint2 q = vrint_vi2_vf(u);
  s = vsub_vf_vf_vf(d, u);
  u = vcast_vf_f(+0.1535920892e-3);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1339262701e-2));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.9618384764e-2));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.5550347269e-1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2402264476e+0));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.6931471825e+0));
#ifdef ENABLE_FMA_SP
  u = vfma_vf_vf_vf_vf(u, s, vcast_vf_f(1));
#else
  // Without FMA, the final 1 + u*s step is done in double-float to keep accuracy.
  u = vf2getx_vf_vf2(dfnormalize_vf2_vf2(dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf_vf(u, s))));
#endif
  u = vldexp2_vf_vf_vi2(u, q);
  u = vsel_vf_vo_vf_vf(vge_vo_vf_vf(d, vcast_vf_f(128)), vcast_vf_f(SLEEF_INFINITY), u);
  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-150)), vreinterpret_vm_vf(u)));
  return u;
}

// xexp2f_u35: looser-accuracy 2^d — same polynomial but the last step is a
// plain fused multiply-add instead of the double-float correction.
EXPORT CONST VECTOR_CC vfloat xexp2f_u35(vfloat d) {
  vfloat u = vrint_vf_vf(d), s;
  vint2 q = vrint_vi2_vf(u);
  s = vsub_vf_vf_vf(d, u);
  u = vcast_vf_f(+0.1535920892e-3);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1339262701e-2));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.9618384764e-2));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.5550347269e-1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2402264476e+0));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.6931471825e+0));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1000000000e+1));
  u = vldexp2_vf_vf_vi2(u, q);
  u = vsel_vf_vo_vf_vf(vge_vo_vf_vf(d, vcast_vf_f(128)), vcast_vf_f(SLEEF_INFINITY), u);
  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-150)), vreinterpret_vm_vf(u)));
  return u;
}

// xexp10f: 10^d. Reduction d = q*log10(2) + s using the split constants
// L10Uf/L10Lf; overflow threshold is log10(FLT_MAX) ~= 38.53.
EXPORT CONST VECTOR_CC vfloat xexp10f(vfloat d) {
  vfloat u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(LOG10_2))), s;
  vint2 q = vrint_vi2_vf(u);
  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-L10Uf), d);
  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-L10Lf), s);
  u = vcast_vf_f(+0.6802555919e-1);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2078080326e+0));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.5393903852e+0));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1171245337e+1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2034678698e+1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2650949001e+1));
  // Leading ln(10) coefficient kept in double-float for the final 1 + x*s step.
  vfloat2 x = dfadd_vf2_vf2_vf(vcast_vf2_f_f(2.3025851249694824219, -3.1705172516493593157e-08), vmul_vf_vf_vf(u, s));
  u = vf2getx_vf_vf2(dfnormalize_vf2_vf2(dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf(x, s))));
  u = vldexp2_vf_vf_vi2(u, q);
  u = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(d, vcast_vf_f(38.5318394191036238941387f)), vcast_vf_f(SLEEF_INFINITYf), u);
  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-50)), vreinterpret_vm_vf(u)));
  return u;
}

// xexp10f_u35: looser-accuracy 10^d using a single-width polynomial only.
EXPORT CONST VECTOR_CC vfloat xexp10f_u35(vfloat d) {
  vfloat u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(LOG10_2))), s;
  vint2 q = vrint_vi2_vf(u);
  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-L10Uf), d);
  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-L10Lf), s);
  u = vcast_vf_f(+0.2064004987e+0);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.5417877436e+0));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1171286821e+1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2034656048e+1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2650948763e+1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2302585125e+1));
  u = vmla_vf_vf_vf_vf(u,
dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.30103001, -1.432098889e-08), e);
#endif
  // s = e*log10(2) + x*log10(e) + x^3*t, accumulated in double-float.
  s = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf2(x, vcast_vf2_f_f(0.868588984, -2.170757285e-08)));
  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)), t));
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(s), vf2gety_vf_vf2(s));
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  r = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITY), r);
  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(SLEEF_NAN), r);
  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-SLEEF_INFINITY), r);
#else
  // vfixup table: 0 -> -inf, negative -> NaN, +inf -> +inf.
  r = vfixup_vf_vf_vf_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);
#endif
  return r;
}

// xlog2f: base-2 log. Same m*2^e decomposition and atanh-series structure
// as xlogf_u1, with log2-scaled coefficients; e is added directly since
// log2(2^e) == e.
EXPORT CONST VECTOR_CC vfloat xlog2f(vfloat d) {
  vfloat2 x;
  vfloat t, m, x2;
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));
  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);
  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0/0.75)));
  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));
  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
#else
  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0/0.75)));
  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
  m = vgetmant_vf_vf(d);
#endif
  x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(-1), m), dfadd2_vf2_vf_vf(vcast_vf_f(1), m));
  x2 = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x));
  t = vcast_vf_f(+0.4374550283e+0f);
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.5764790177e+0f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.9618012905120f));
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  // 2.885390... is 2/ln(2), split into a double-float pair.
  vfloat2 s = dfadd2_vf2_vf_vf2(vcast_vf_vi2(e), dfmul_vf2_vf2_vf2(x, vcast_vf2_f_f(2.8853900432586669922, 3.2734474483568488616e-08)));
#else
  vfloat2 s = dfadd2_vf2_vf_vf2(e, dfmul_vf2_vf2_vf2(x, vcast_vf2_f_f(2.8853900432586669922, 3.2734474483568488616e-08)));
#endif
  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)), t));
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(s), vf2gety_vf_vf2(s));
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  r = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITY), r);
  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(SLEEF_NAN), r);
  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-SLEEF_INFINITY), r);
#else
  r = vfixup_vf_vf_vf_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);
#endif
  return r;
}

// xlog2f_u35: looser-accuracy base-2 log; entirely single-width arithmetic
// (no double-float accumulation).
EXPORT CONST VECTOR_CC vfloat xlog2f_u35(vfloat d) {
  vfloat m, t, x, x2;
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));
  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);
  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0/0.75)));
  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));
  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
#else
  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0/0.75)));
  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
  m = vgetmant_vf_vf(d);
#endif
  x = vdiv_vf_vf_vf(vsub_vf_vf_vf(m, vcast_vf_f(1)), vadd_vf_vf_vf(m, vcast_vf_f(1)));
  x2 = vmul_vf_vf_vf(x, x);
  t = vcast_vf_f(+0.4374088347e+0);
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.5764843822e+0));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.9618024230e+0));
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vfloat r = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(x2, x), t, vmla_vf_vf_vf_vf(x, vcast_vf_f(+0.2885390043e+1), vcast_vf_vi2(e)));
  r = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITY), r);
  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(SLEEF_NAN), r);
  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-SLEEF_INFINITY), r);
#else
  vfloat r = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(x2, x), t, vmla_vf_vf_vf_vf(x, vcast_vf_f(+0.2885390043e+1), e));
  r = vfixup_vf_vf_vf_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);
#endif
  return r;
}

// xlog1pf: log(1+d), accurate near d == 0. The mantissa m is reconstructed
// as d*t + (t-1) (with t = 2^-e) so that no precision is lost forming 1+d;
// special cases: overflow past 1e38, NaN for d < -1, -inf at d == -1, -0 in.
EXPORT CONST VECTOR_CC vfloat xlog1pf(vfloat d) {
  vfloat2 x;
  vfloat t, m, x2;
  vfloat dp1 = vadd_vf_vf_vf(d, vcast_vf_f(1));
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vf_vf(dp1, vcast_vf_f(FLT_MIN));
  dp1 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(dp1, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), dp1);
  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(dp1, vcast_vf_f(1.0f/0.75f)));
  t = vldexp3_vf_vf_vi2(vcast_vf_f(1), vneg_vi2_vi2(e));
  m = vmla_vf_vf_vf_vf(d, t, vsub_vf_vf_vf(t, vcast_vf_f(1)));
  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), vcast_vf_vi2(e));
#else
  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(dp1, vcast_vf_f(1.0f/0.75f)));
  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
  t = vldexp3_vf_vf_vi2(vcast_vf_f(1), vneg_vi2_vi2(vrint_vi2_vf(e)));
  m = vmla_vf_vf_vf_vf(d, t, vsub_vf_vf_vf(t, vcast_vf_f(1)));
  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), e);
#endif
  x = dfdiv_vf2_vf2_vf2(vcast_vf2_vf_vf(m, vcast_vf_f(0)), dfadd_vf2_vf_vf(vcast_vf_f(2), m));
  x2 = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x));
  t = vcast_vf_f(+0.3027294874e+0f);
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.3996108174e+0f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.6666694880e+0f));
  s = dfadd_vf2_vf2_vf2(s, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));
  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)), t));
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(s), vf2gety_vf_vf2(s));
  r = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(d, vcast_vf_f(1e+38)), vcast_vf_f(SLEEF_INFINITYf), r);
  r = vreinterpret_vf_vm(vor_vm_vo32_vm(vgt_vo_vf_vf(vcast_vf_f(-1), d), vreinterpret_vm_vf(r)));
  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(-1)), vcast_vf_f(-SLEEF_INFINITYf), r);
  r = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), r);
  return r;
}
#endif // #if !defined(DETERMINISTIC)

// #if !defined(DETERMINISTIC)

// xfabsf: |x|.
EXPORT CONST VECTOR_CC vfloat xfabsf(vfloat x) { return vabs_vf_vf(x); }

// xcopysignf: magnitude of x with the sign of y.
EXPORT CONST VECTOR_CC vfloat xcopysignf(vfloat x, vfloat y) { return vcopysign_vf_vf_vf(x, y); }

// xfmaxf: fmax semantics — if y is NaN, return x (on x86 the hardware max
// already returns the second operand on NaN, so vmax is used directly).
EXPORT CONST VECTOR_CC vfloat xfmaxf(vfloat x, vfloat y) {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(ENABLE_VECEXT) && !defined(ENABLE_PUREC)
  return vsel_vf_vo_vf_vf(visnan_vo_vf(y), x, vmax_vf_vf_vf(x, y));
#else
  return vsel_vf_vo_vf_vf(visnan_vo_vf(y), x, vsel_vf_vo_vf_vf(vgt_vo_vf_vf(x, y), x, y));
#endif
}

// xfminf: fmin semantics, mirror of xfmaxf.
EXPORT CONST VECTOR_CC vfloat xfminf(vfloat x, vfloat y) {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(ENABLE_VECEXT) && !defined(ENABLE_PUREC)
  return vsel_vf_vo_vf_vf(visnan_vo_vf(y), x, vmin_vf_vf_vf(x, y));
#else
  return vsel_vf_vo_vf_vf(visnan_vo_vf(y), x, vsel_vf_vo_vf_vf(vgt_vo_vf_vf(y, x), x, y));
#endif
}

// xfdimf: positive difference — max(x - y, 0).
EXPORT CONST VECTOR_CC vfloat xfdimf(vfloat x, vfloat y) {
  vfloat ret = vsub_vf_vf_vf(x, y);
  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(ret, vcast_vf_f(0)), veq_vo_vf_vf(x, y)), vcast_vf_f(0), ret);
  return ret;
}

// xtruncf: round toward zero. |x| >= 2^23 (or inf) is already integral and
// is passed through; the sign is re-applied via copysign to preserve -0.
EXPORT CONST VECTOR_CC vfloat xtruncf(vfloat x) {
#ifdef FULL_FP_ROUNDING
  return vtruncate_vf_vf(x);
#else
  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));
  return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23))), x, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x));
#endif
}

// xfloorf: round toward -inf; negative fractional parts are shifted up by 1.
EXPORT CONST VECTOR_CC vfloat xfloorf(vfloat x) {
  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));
  fr = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(fr, vcast_vf_f(0)), vadd_vf_vf_vf(fr, vcast_vf_f(1.0f)), fr);
  return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x),
vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23))), x, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x)); } EXPORT CONST VECTOR_CC vfloat xceilf(vfloat x) { vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x))); fr = vsel_vf_vo_vf_vf(vle_vo_vf_vf(fr, vcast_vf_f(0)), fr, vsub_vf_vf_vf(fr, vcast_vf_f(1.0f))); return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23))), x, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x)); } EXPORT CONST VECTOR_CC vfloat xroundf(vfloat d) { vfloat x = vadd_vf_vf_vf(d, vcast_vf_f(0.5f)); vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x))); x = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vle_vo_vf_vf(x, vcast_vf_f(0)), veq_vo_vf_vf(fr, vcast_vf_f(0))), vsub_vf_vf_vf(x, vcast_vf_f(1.0f)), x); fr = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(fr, vcast_vf_f(0)), vadd_vf_vf_vf(fr, vcast_vf_f(1.0f)), fr); x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0.4999999701976776123f)), vcast_vf_f(0), x); return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(d), vge_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(INT64_C(1) << 23))), d, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), d)); } EXPORT CONST VECTOR_CC vfloat xrintf(vfloat d) { #ifdef FULL_FP_ROUNDING return vrint_vf_vf(d); #else vfloat c = vmulsign_vf_vf_vf(vcast_vf_f(1 << 23), d); return vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1 << 23)), d, vorsign_vf_vf_vf(vsub_vf_vf_vf(vadd_vf_vf_vf(d, c), c), d)); #endif } EXPORT CONST VECTOR_CC vfloat xfmaf(vfloat x, vfloat y, vfloat z) { #ifdef ENABLE_FMA_SP return vfma_vf_vf_vf_vf(x, y, z); #else vfloat h2 = vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z), q = vcast_vf_f(1); vopmask o = vlt_vo_vf_vf(vabs_vf_vf(h2), vcast_vf_f(1e-38f)); { const float c0 = UINT64_C(1) << 25, c1 = c0 * c0, c2 = c1 * c1; x = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(x, vcast_vf_f(c1)), x); y = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(y, vcast_vf_f(c1)), y); z = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(z, vcast_vf_f(c2)), z); q = vsel_vf_vo_vf_vf(o, 
vcast_vf_f(1.0f / c2), q); } o = vgt_vo_vf_vf(vabs_vf_vf(h2), vcast_vf_f(1e+38f)); { const float c0 = UINT64_C(1) << 25, c1 = c0 * c0, c2 = c1 * c1; x = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(x, vcast_vf_f(1.0f / c1)), x); y = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(y, vcast_vf_f(1.0f / c1)), y); z = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(z, vcast_vf_f(1.0f / c2)), z); q = vsel_vf_vo_vf_vf(o, vcast_vf_f(c2), q); } vfloat2 d = dfmul_vf2_vf_vf(x, y); d = dfadd2_vf2_vf2_vf(d, z); vfloat ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(0)), veq_vo_vf_vf(y, vcast_vf_f(0))), z, vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d))); o = visinf_vo_vf(z); o = vandnot_vo_vo_vo(visinf_vo_vf(x), o); o = vandnot_vo_vo_vo(visnan_vo_vf(x), o); o = vandnot_vo_vo_vo(visinf_vo_vf(y), o); o = vandnot_vo_vo_vo(visnan_vo_vf(y), o); h2 = vsel_vf_vo_vf_vf(o, z, h2); o = vor_vo_vo_vo(visinf_vo_vf(h2), visnan_vo_vf(h2)); return vsel_vf_vo_vf_vf(o, h2, vmul_vf_vf_vf(ret, q)); #endif } #endif // #if !defined(DETERMINISTIC) #if !defined(SLEEF_GENHEADER) static INLINE CONST VECTOR_CC vint2 vcast_vi2_i_i(int i0, int i1) { return vcast_vi2_vm(vcast_vm_i_i(i0, i1)); } #endif SQRTFU05_FUNCATR VECTOR_CC vfloat xsqrtf_u05(vfloat d) { #if defined(ENABLE_FMA_SP) vfloat q, w, x, y, z; d = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(SLEEF_NANf), d); vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(5.2939559203393770e-23f)); d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(1.8889465931478580e+22f)), d); q = vsel_vf_vo_vf_vf(o, vcast_vf_f(7.2759576141834260e-12f), vcast_vf_f(1.0f)); y = vreinterpret_vf_vi2(vsub_vi2_vi2_vi2(vcast_vi2_i(0x5f3759df), vsrl_vi2_vi2_i(vreinterpret_vi2_vf(d), 1))); x = vmul_vf_vf_vf(d, y); w = vmul_vf_vf_vf(vcast_vf_f(0.5), y); y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5)); x = vfma_vf_vf_vf_vf(x, y, x); w = vfma_vf_vf_vf_vf(w, y, w); y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5)); x = vfma_vf_vf_vf_vf(x, y, x); w = vfma_vf_vf_vf_vf(w, y, w); y = vfmanp_vf_vf_vf_vf(x, w, 
vcast_vf_f(1.5)); w = vadd_vf_vf_vf(w, w); w = vmul_vf_vf_vf(w, y); x = vmul_vf_vf_vf(w, d); y = vfmapn_vf_vf_vf_vf(w, d, x); z = vfmanp_vf_vf_vf_vf(w, x, vcast_vf_f(1)); z = vfmanp_vf_vf_vf_vf(w, y, z); w = vmul_vf_vf_vf(vcast_vf_f(0.5), x); w = vfma_vf_vf_vf_vf(w, z, y); w = vadd_vf_vf_vf(w, x); w = vmul_vf_vf_vf(w, q); w = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(d, vcast_vf_f(0)), veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf))), d, w); w = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(SLEEF_NANf), w); return w; #else vfloat q; vopmask o; d = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(SLEEF_NANf), d); o = vlt_vo_vf_vf(d, vcast_vf_f(5.2939559203393770e-23f)); d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(1.8889465931478580e+22f)), d); q = vsel_vf_vo_vf_vf(o, vcast_vf_f(7.2759576141834260e-12f*0.5f), vcast_vf_f(0.5f)); o = vgt_vo_vf_vf(d, vcast_vf_f(1.8446744073709552e+19f)); d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(5.4210108624275220e-20f)), d); q = vsel_vf_vo_vf_vf(o, vcast_vf_f(4294967296.0f * 0.5f), q); vfloat x = vreinterpret_vf_vi2(vsub_vi2_vi2_vi2(vcast_vi2_i(0x5f375a86), vsrl_vi2_vi2_i(vreinterpret_vi2_vf(vadd_vf_vf_vf(d, vcast_vf_f(1e-45f))), 1))); x = vmul_vf_vf_vf(x, vsub_vf_vf_vf(vcast_vf_f(1.5f), vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(0.5f), d), x), x))); x = vmul_vf_vf_vf(x, vsub_vf_vf_vf(vcast_vf_f(1.5f), vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(0.5f), d), x), x))); x = vmul_vf_vf_vf(x, vsub_vf_vf_vf(vcast_vf_f(1.5f), vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(0.5f), d), x), x))); x = vmul_vf_vf_vf(x, d); vfloat2 d2 = dfmul_vf2_vf2_vf2(dfadd2_vf2_vf_vf2(d, dfmul_vf2_vf_vf(x, x)), dfrec_vf2_vf(x)); x = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d2), vf2gety_vf_vf2(d2)), q); x = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITYf), x); x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), d, x); return x; #endif } EXPORT CONST VECTOR_CC vfloat xsqrtf(vfloat d) 
/* Body of xsqrtf (signature is on the previous line): dispatch between the
   native vector square root and the software implementation above. */
{
#ifdef ACCURATE_SQRT
  return vsqrt_vf_vf(d);
#else
  // fall back to approximation if ACCURATE_SQRT is undefined
  return xsqrtf_u05(d);
#endif
}

#if !defined(DETERMINISTIC)

/* hypot, high-accuracy variant (SLEEF "_u05" suffix denotes the ULP bound).
   Both operands are scaled up by 2^24 when max < FLT_MIN so the
   double-float ("vfloat2") division/sqrt below does not lose precision on
   subnormals; NaN/Inf/zero special cases are patched with selects at the end. */
EXPORT CONST VECTOR_CC vfloat xhypotf_u05(vfloat x, vfloat y) {
  x = vabs_vf_vf(x); y = vabs_vf_vf(y);
  vfloat min = vmin_vf_vf_vf(x, y), n = min;
  vfloat max = vmax_vf_vf_vf(x, y), d = max;

  vopmask o = vlt_vo_vf_vf(max, vcast_vf_f(FLT_MIN));
  n = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(n, vcast_vf_f(UINT64_C(1) << 24)), n);
  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(UINT64_C(1) << 24)), d);

  // max * sqrt((min/max)^2 + 1), computed in double-float precision
  vfloat2 t = dfdiv_vf2_vf2_vf2(vcast_vf2_vf_vf(n, vcast_vf_f(0)), vcast_vf2_vf_vf(d, vcast_vf_f(0)));
  t = dfmul_vf2_vf2_vf(dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfsqu_vf2_vf2(t), vcast_vf_f(1))), max);
  vfloat ret = vadd_vf_vf_vf(vf2getx_vf_vf2(t), vf2gety_vf_vf2(t));
  ret = vsel_vf_vo_vf_vf(visnan_vo_vf(ret), vcast_vf_f(SLEEF_INFINITYf), ret);
  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(min, vcast_vf_f(0)), max, ret);
  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vcast_vf_f(SLEEF_NANf), ret);
  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(SLEEF_INFINITYf)), veq_vo_vf_vf(y, vcast_vf_f(SLEEF_INFINITYf))), vcast_vf_f(SLEEF_INFINITYf), ret);

  return ret;
}

/* Faster hypot with a looser accuracy bound ("_u35"): single-precision
   max * sqrt((min/max)^2 + 1), plus the same special-case fixups.
   (n and d are kept for symmetry with the _u05 variant but are unused here.) */
EXPORT CONST VECTOR_CC vfloat xhypotf_u35(vfloat x, vfloat y) {
  x = vabs_vf_vf(x); y = vabs_vf_vf(y);
  vfloat min = vmin_vf_vf_vf(x, y), n = min;
  vfloat max = vmax_vf_vf_vf(x, y), d = max;

  vfloat t = vdiv_vf_vf_vf(min, max);
  vfloat ret = vmul_vf_vf_vf(max, vsqrt_vf_vf(vmla_vf_vf_vf_vf(t, t, vcast_vf_f(1))));
  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(min, vcast_vf_f(0)), max, ret);
  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vcast_vf_f(SLEEF_NANf), ret);
  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(SLEEF_INFINITYf)), veq_vo_vf_vf(y, vcast_vf_f(SLEEF_INFINITYf))), vcast_vf_f(SLEEF_INFINITYf), ret);

  return ret;
}

/* nextafterf: step x one ULP toward y by integer manipulation of the float
   bit pattern (sign-magnitude mapped to a monotonic two's-complement
   ordering, decremented, mapped back), then restore signed-zero and NaN
   special cases. */
EXPORT CONST VECTOR_CC vfloat xnextafterf(vfloat x, vfloat y) {
  x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), vmulsign_vf_vf_vf(vcast_vf_f(0), y), x);
  vint2 t, xi2 = vreinterpret_vi2_vf(x);
  vopmask c = vxor_vo_vo_vo(vsignbit_vo_vf(x), vge_vo_vf_vf(y, x));

  // map into monotonic integer ordering, decrement, map back
  xi2 = vsel_vi2_vo_vi2_vi2(c, vsub_vi2_vi2_vi2(vcast_vi2_i(0), vxor_vi2_vi2_vi2(xi2, vcast_vi2_i(1 << 31))), xi2);
  xi2 = vsel_vi2_vo_vi2_vi2(vneq_vo_vf_vf(x, y), vsub_vi2_vi2_vi2(xi2, vcast_vi2_i(1)), xi2);
  xi2 = vsel_vi2_vo_vi2_vi2(c, vsub_vi2_vi2_vi2(vcast_vi2_i(0), vxor_vi2_vi2_vi2(xi2, vcast_vi2_i(1 << 31))), xi2);

  vfloat ret = vreinterpret_vf_vi2(xi2);

  ret = vsel_vf_vo_vf_vf(vand_vo_vo_vo(veq_vo_vf_vf(ret, vcast_vf_f(0)), vneq_vo_vf_vf(x, vcast_vf_f(0))), vmulsign_vf_vf_vf(vcast_vf_f(0), x), ret);
  ret = vsel_vf_vo_vf_vf(vand_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(0)), veq_vo_vf_vf(y, vcast_vf_f(0))), y, ret);
  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vcast_vf_f(SLEEF_NANf), ret);

  return ret;
}

/* frfrexp: return the fraction of x normalized into [0.5, 1) by forcing the
   exponent field to 0x3f000000; subnormals are pre-scaled by 2^30 so they
   have a normal representation first.  Inf keeps its sign, 0 stays 0. */
EXPORT CONST VECTOR_CC vfloat xfrfrexpf(vfloat x) {
  x = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(FLT_MIN)), vmul_vf_vf_vf(x, vcast_vf_f(UINT64_C(1) << 30)), x);

  vmask xm = vreinterpret_vm_vf(x);
  xm = vand_vm_vm_vm(xm, vcast_vm_i_i(~0x7f800000U, ~0x7f800000U));
  xm = vor_vm_vm_vm (xm, vcast_vm_i_i( 0x3f000000U, 0x3f000000U));

  vfloat ret = vreinterpret_vf_vm(xm);

  ret = vsel_vf_vo_vf_vf(visinf_vo_vf(x), vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), x), ret);
  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), x, ret);

  return ret;
}
#endif // #if !defined(DETERMINISTIC)

/* expfrexp: exponent extraction is not implemented for single precision;
   the double-precision algorithm is kept commented out for reference and
   the function returns 0 for every lane. */
EXPORT CONST VECTOR_CC vint2 xexpfrexpf(vfloat x) {
  /*
  x = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(FLT_MIN)), vmul_vf_vf_vf(x, vcast_vf_f(UINT64_C(1) << 63)), x);

  vint ret = vcastu_vi_vi2(vreinterpret_vi2_vf(x));
  ret = vsub_vi_vi_vi(vand_vi_vi_vi(vsrl_vi_vi_i(ret, 20), vcast_vi_i(0x7ff)), vcast_vi_i(0x3fe));

  ret = vsel_vi_vo_vi_vi(vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(0)), visnan_vo_vf(x)), visinf_vo_vf(x)), vcast_vi_i(0), ret);

  return ret;
  */
  return vcast_vi2_i(0);
}

/* Nudge a float one ULP toward zero by decrementing its bit pattern;
   exact zeros are passed through unchanged. */
static INLINE CONST VECTOR_CC vfloat vtoward0f(vfloat x) {
  vfloat t = vreinterpret_vf_vi2(vsub_vi2_vi2_vi2(vreinterpret_vi2_vf(x), vcast_vi2_i(1)));
  return vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), vcast_vf_f(0), t);
}

/* Truncation helper: native truncate when the platform provides full FP
   rounding, otherwise via integer conversion with a bypass for
   |x| >= 2^23 (already integral in binary32). */
static INLINE CONST VECTOR_CC vfloat vptruncf(vfloat x) {
#ifdef FULL_FP_ROUNDING
  return vtruncate_vf_vf(x);
#else
  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));
  return vsel_vf_vo_vf_vf(vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23)), x, vsub_vf_vf_vf(x, fr));
#endif
}

#if !defined(DETERMINISTIC)
/* fmod by iterative quotient subtraction in double-float precision.
   Operands are pre-scaled by 2^25 when the divisor is subnormal (and the
   result scaled back through s); the reciprocal is biased toward zero so
   the truncated trial quotient never overshoots.  The loop shrinks the
   remainder below |y| within at most 8 iterations. */
EXPORT CONST VECTOR_CC vfloat xfmodf(vfloat x, vfloat y) {
  vfloat nu = vabs_vf_vf(x), de = vabs_vf_vf(y), s = vcast_vf_f(1), q;
  vopmask o = vlt_vo_vf_vf(de, vcast_vf_f(FLT_MIN));
  nu = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(nu, vcast_vf_f(UINT64_C(1) << 25)), nu);
  de = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(de, vcast_vf_f(UINT64_C(1) << 25)), de);
  s = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(s , vcast_vf_f(1.0f / (UINT64_C(1) << 25))), s);

  vfloat rde = vtoward0f(vrec_vf_vf(de));
#if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4)
  rde = vtoward0f(rde);
#endif
  vfloat2 r = vcast_vf2_vf_vf(nu, vcast_vf_f(0));

  for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1
    q = vptruncf(vmul_vf_vf_vf(vtoward0f(vf2getx_vf_vf2(r)), rde));
    q = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vgt_vo_vf_vf(vmul_vf_vf_vf(vcast_vf_f(3), de), vf2getx_vf_vf2(r)), vge_vo_vf_vf(vf2getx_vf_vf2(r), de)), vcast_vf_f(2), q);
    q = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vgt_vo_vf_vf(vmul_vf_vf_vf(vcast_vf_f(2), de), vf2getx_vf_vf2(r)), vge_vo_vf_vf(vf2getx_vf_vf2(r), de)), vcast_vf_f(1), q);
    r = dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf2(r, dfmul_vf2_vf_vf(vptruncf(q), vneg_vf_vf(de))));
    if (vtestallones_i_vo32(vlt_vo_vf_vf(vf2getx_vf_vf2(r), de))) break;
  }

  vfloat ret = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(r), vf2gety_vf_vf2(r)), s);
  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(r), vf2gety_vf_vf2(r)), de), vcast_vf_f(0), ret);

  ret = vmulsign_vf_vf_vf(ret, x);

  ret = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(nu, de), x, ret);
  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(de, vcast_vf_f(0)), vcast_vf_f(SLEEF_NANf), ret);

  return ret;
}

/* Round-to-nearest helper: native vrint when available, otherwise the
   add-and-subtract-2^23 trick with the sign restored via vorsign. */
static INLINE CONST VECTOR_CC vfloat vrintfk2_vf_vf(vfloat d) {
#ifdef FULL_FP_ROUNDING
  return vrint_vf_vf(d);
#else
  vfloat c = vmulsign_vf_vf_vf(vcast_vf_f(1 << 23), d);
  return vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1 << 23)), d, vorsign_vf_vf_vf(vsub_vf_vf_vf(vadd_vf_vf_vf(d, c), c), d));
#endif
}

/* IEEE remainder (rounded quotient), using the same iterative double-float
   scheme as xfmodf; qisodd tracks the parity of the accumulated quotient
   for the tie-break when |r| == 0.5*|y|. */
EXPORT CONST VECTOR_CC vfloat xremainderf(vfloat x, vfloat y) {
  vfloat n = vabs_vf_vf(x), d = vabs_vf_vf(y), s = vcast_vf_f(1), q;
  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN*2));
  n = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(n, vcast_vf_f(UINT64_C(1) << 25)), n);
  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(UINT64_C(1) << 25)), d);
  s = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(s , vcast_vf_f(1.0f / (UINT64_C(1) << 25))), s);
  vfloat2 r = vcast_vf2_vf_vf(n, vcast_vf_f(0));
  vfloat rd = vrec_vf_vf(d);
  vopmask qisodd = vneq_vo_vf_vf(vcast_vf_f(0), vcast_vf_f(0)); // all-false initial parity

  for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1
    q = vrintfk2_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(r), rd));
    q = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vabs_vf_vf(vf2getx_vf_vf2(r)), vmul_vf_vf_vf(d, vcast_vf_f(1.5f))), vmulsign_vf_vf_vf(vcast_vf_f(1.0f), vf2getx_vf_vf2(r)), q);
    q = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(vabs_vf_vf(vf2getx_vf_vf2(r)), vmul_vf_vf_vf(d, vcast_vf_f(0.5f))), vandnot_vo_vo_vo(qisodd, veq_vo_vf_vf(vabs_vf_vf(vf2getx_vf_vf2(r)), vmul_vf_vf_vf(d, vcast_vf_f(0.5f))))), vcast_vf_f(0.0), q);
    if (vtestallones_i_vo32(veq_vo_vf_vf(q, vcast_vf_f(0)))) break;
    q = vsel_vf_vo_vf_vf(visinf_vo_vf(vmul_vf_vf_vf(q, vneg_vf_vf(d))), vadd_vf_vf_vf(q, vmulsign_vf_vf_vf(vcast_vf_f(-1), vf2getx_vf_vf2(r))), q);
    qisodd = vxor_vo_vo_vo(qisodd, vand_vo_vo_vo(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vtruncate_vi2_vf(q), vcast_vi2_i(1)), vcast_vi2_i(1)), vlt_vo_vf_vf(vabs_vf_vf(q), vcast_vf_f(1 << 24))));
    r = dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf2(r, dfmul_vf2_vf_vf(q, vneg_vf_vf(d))));
  }

  vfloat ret = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(r), vf2gety_vf_vf2(r)), s);
  ret = vmulsign_vf_vf_vf(ret, x);
  ret = vsel_vf_vo_vf_vf(visinf_vo_vf(y), vsel_vf_vo_vf_vf(visinf_vo_vf(x), vcast_vf_f(SLEEF_NANf), x), ret);
  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(SLEEF_NANf), ret);
  return ret;
}
#endif // #if !defined(DETERMINISTIC)

//

/* sinpifk: double-float kernel for sin(pi*d).  q = round-to-even(4*d)
   selects the quadrant; o switches between the two polynomial coefficient
   sets and the final sign-bit XOR negates quadrants where bit 2 of q is set. */
static INLINE CONST VECTOR_CC vfloat2 sinpifk(vfloat d) {
  vopmask o;
  vfloat u, s, t;
  vfloat2 x, s2;

  u = vmul_vf_vf_vf(d, vcast_vf_f(4.0));
  vint2 q = vtruncate_vi2_vf(u);
  q = vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vxor_vi2_vi2_vi2(vsrl_vi2_vi2_i(q, 31), vcast_vi2_i(1))), vcast_vi2_i(~1));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));

  s = vsub_vf_vf_vf(u, vcast_vf_vi2(q));
  t = s;
  s = vmul_vf_vf_vf(s, s);
  s2 = dfmul_vf2_vf_vf(t, t);

  //

  u = vsel_vf_vo_f_f(o, -0.2430611801e-7f, +0.3093842054e-6f);
  u = vmla_vf_vf_vf_vf(u, s, vsel_vf_vo_f_f(o, +0.3590577080e-5f, -0.3657307388e-4f));
  u = vmla_vf_vf_vf_vf(u, s, vsel_vf_vo_f_f(o, -0.3259917721e-3f, +0.2490393585e-2f));
  x = dfadd2_vf2_vf_vf2(vmul_vf_vf_vf(u, s), vsel_vf2_vo_f_f_f_f(o, 0.015854343771934509277, 4.4940051354032242811e-10, -0.080745510756969451904, -1.3373665339076936258e-09));
  x = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(s2, x), vsel_vf2_vo_f_f_f_f(o, -0.30842512845993041992, -9.0728339030733922277e-09, 0.78539818525314331055, -2.1857338617566484855e-08));

  x = dfmul_vf2_vf2_vf2(x, vsel_vf2_vo_vf2_vf2(o, s2, vcast_vf2_vf_vf(t, vcast_vf_f(0))));
  x = vsel_vf2_vo_vf2_vf2(o, dfadd2_vf2_vf2_vf(x, vcast_vf_f(1)), x);

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(4)), vcast_vi2_i(4));
  x = vf2setx_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(x)))));
  x = vf2sety_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))),
vreinterpret_vm_vf(vf2gety_vf_vf2(x)))));
  return x;
}

#if !defined(DETERMINISTIC)
/* sin(pi*d): sum the double-float kernel result, force -0.0 for a -0.0
   input, zero lanes whose |d| exceeds the reduction range TRIGRANGEMAX4f,
   and turn infinities into NaN via the mask OR. */
EXPORT CONST VECTOR_CC vfloat xsinpif_u05(vfloat d) {
  vfloat2 x = sinpifk(d);
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));

  r = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0), r);
  r = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX4f)), vreinterpret_vm_vf(r)));
  r = vreinterpret_vf_vm(vor_vm_vo32_vm(visinf_vo_vf(d), vreinterpret_vm_vf(r)));

  return r;
}
#endif // #if !defined(DETERMINISTIC)

/* cospifk: cos(pi*d) kernel.  Same structure and coefficients as sinpifk;
   the quadrant predicate is inverted (compare against 0 instead of 2) and
   the final sign test uses q+2. */
static INLINE CONST VECTOR_CC vfloat2 cospifk(vfloat d) {
  vopmask o;
  vfloat u, s, t;
  vfloat2 x, s2;

  u = vmul_vf_vf_vf(d, vcast_vf_f(4.0));
  vint2 q = vtruncate_vi2_vf(u);
  q = vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vxor_vi2_vi2_vi2(vsrl_vi2_vi2_i(q, 31), vcast_vi2_i(1))), vcast_vi2_i(~1));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0));

  s = vsub_vf_vf_vf(u, vcast_vf_vi2(q));
  t = s;
  s = vmul_vf_vf_vf(s, s);
  s2 = dfmul_vf2_vf_vf(t, t);

  //

  u = vsel_vf_vo_f_f(o, -0.2430611801e-7f, +0.3093842054e-6f);
  u = vmla_vf_vf_vf_vf(u, s, vsel_vf_vo_f_f(o, +0.3590577080e-5f, -0.3657307388e-4f));
  u = vmla_vf_vf_vf_vf(u, s, vsel_vf_vo_f_f(o, -0.3259917721e-3f, +0.2490393585e-2f));
  x = dfadd2_vf2_vf_vf2(vmul_vf_vf_vf(u, s), vsel_vf2_vo_f_f_f_f(o, 0.015854343771934509277, 4.4940051354032242811e-10, -0.080745510756969451904, -1.3373665339076936258e-09));
  x = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(s2, x), vsel_vf2_vo_f_f_f_f(o, -0.30842512845993041992, -9.0728339030733922277e-09, 0.78539818525314331055, -2.1857338617566484855e-08));

  x = dfmul_vf2_vf2_vf2(x, vsel_vf2_vo_vf2_vf2(o, s2, vcast_vf2_vf_vf(t, vcast_vf_f(0))));
  x = vsel_vf2_vo_vf2_vf2(o, dfadd2_vf2_vf2_vf(x, vcast_vf_f(1)), x);

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(4)), vcast_vi2_i(4));
  x = vf2setx_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(x)))));
  x = vf2sety_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(x)))));

  return x;
}

#if !defined(DETERMINISTIC)
/* cos(pi*d): out-of-range lanes return 1, infinities become NaN. */
EXPORT CONST VECTOR_CC vfloat xcospif_u05(vfloat d) {
  vfloat2 x = cospifk(d);
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));

  r = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX4f)), vcast_vf_f(1), r);
  r = vreinterpret_vf_vm(vor_vm_vo32_vm(visinf_vo_vf(d), vreinterpret_vm_vf(r)));

  return r;
}
#endif // #if !defined(DETERMINISTIC)

#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA))
/* Pair of double-floats returned by gammafk (guarded out on SVE, where an
   alternative definition is presumably provided elsewhere -- sizeless SVE
   vectors cannot be aggregated in a struct; confirm against upstream). */
typedef struct {
  vfloat2 a, b;
} df2;

static df2 df2setab_df2_vf2_vf2(vfloat2 a, vfloat2 b) { df2 r = { a, b }; return r; }
static vfloat2 df2geta_vf2_df2(df2 d) { return d.a; }
static vfloat2 df2getb_vf2_df2(df2 d) { return d.b; }
#endif

/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
/* gammafk: shared kernel for tgamma/lgamma.  Returns a = log-magnitude
   term and b = clln/clld ratio term.  otiny handles |a| < 1e-30, oref
   applies the reflection x = 1 - a for a < 0.5, and o0/o2 pick between
   three polynomial regimes on x. */
static CONST df2 gammafk(vfloat a) {
  vfloat2 clc = vcast_vf2_f_f(0, 0), clln = vcast_vf2_f_f(1, 0), clld = vcast_vf2_f_f(1, 0);
  vfloat2 v = vcast_vf2_f_f(1, 0), x, y, z;
  vfloat t, u;

  vopmask otiny = vlt_vo_vf_vf(vabs_vf_vf(a), vcast_vf_f(1e-30f)), oref = vlt_vo_vf_vf(a, vcast_vf_f(0.5));

  x = vsel_vf2_vo_vf2_vf2(otiny, vcast_vf2_f_f(0, 0), vsel_vf2_vo_vf2_vf2(oref, dfadd2_vf2_vf_vf(vcast_vf_f(1), vneg_vf_vf(a)), vcast_vf2_vf_vf(a, vcast_vf_f(0))));

  vopmask o0 = vand_vo_vo_vo(vle_vo_vf_vf(vcast_vf_f(0.5), vf2getx_vf_vf2(x)), vle_vo_vf_vf(vf2getx_vf_vf2(x), vcast_vf_f(1.2)));
  vopmask o2 = vle_vo_vf_vf(vcast_vf_f(2.3), vf2getx_vf_vf2(x));

  // shift x into the polynomial's domain, accumulating the product in clln
  y = dfnormalize_vf2_vf2(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(x, vcast_vf_f(1)), x));
  y = dfnormalize_vf2_vf2(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(x, vcast_vf_f(2)), y));

  vopmask o = vand_vo_vo_vo(o2, vle_vo_vf_vf(vf2getx_vf_vf2(x), vcast_vf_f(7)));
  clln = vsel_vf2_vo_vf2_vf2(o, y, clln);

  x = vsel_vf2_vo_vf2_vf2(o, dfadd2_vf2_vf2_vf(x, vcast_vf_f(3)), x);
  t = vsel_vf_vo_vf_vf(o2, vrec_vf_vf(vf2getx_vf_vf2(x)), vf2getx_vf_vf2(dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf(x, vsel_vf_vo_f_f(o0, -1, -2)))));

  // polynomial coefficients: per-lane choice among the o2 / o0 / default regimes
  u = vsel_vf_vo_vo_f_f_f(o2, o0, +0.000839498720672087279971000786, +0.9435157776e+0f, +0.1102489550e-3f);
  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, -5.17179090826059219329394422e-05, +0.8670063615e+0f, +0.8160019934e-4f));
  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, -0.000592166437353693882857342347, +0.4826702476e+0f, +0.1528468856e-3f));
  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, +6.97281375836585777403743539e-05, -0.8855129778e-1f, -0.2355068718e-3f));
  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, +0.000784039221720066627493314301, +0.1013825238e+0f, +0.4962242092e-3f));
  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, -0.000229472093621399176949318732, -0.1493408978e+0f, -0.1193488017e-2f));
  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, -0.002681327160493827160473958490, +0.1697509140e+0f, +0.2891599433e-2f));
  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, +0.003472222222222222222175164840, -0.2072454542e+0f, -0.7385451812e-2f));
  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, +0.083333333333333333335592087900, +0.2705872357e+0f, +0.2058077045e-1f));

  // Stirling-style branch for the o2 regime
  y = dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(x, vcast_vf_f(-0.5)), logk2f(x));
  y = dfadd2_vf2_vf2_vf2(y, dfneg_vf2_vf2(x));
  y = dfadd2_vf2_vf2_vf2(y, vcast_vf2_d(0.91893853320467278056)); // 0.5*log(2*M_PI)

  z = dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf (u, t), vsel_vf_vo_f_f(o0, -0.400686534596170958447352690395e+0f, -0.673523028297382446749257758235e-1f));
  z = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(z, t), vsel_vf_vo_f_f(o0, +0.822466960142643054450325495997e+0f, +0.322467033928981157743538726901e+0f));
  z = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(z, t), vsel_vf_vo_f_f(o0, -0.577215665946766039837398973297e+0f, +0.422784335087484338986941629852e+0f));
  z = dfmul_vf2_vf2_vf(z, t);

  clc = vsel_vf2_vo_vf2_vf2(o2, y, z);

  clld = vsel_vf2_vo_vf2_vf2(o2,
dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(u, t), vcast_vf_f(1)), clld);

  y = clln;

  clc = vsel_vf2_vo_vf2_vf2(otiny, vcast_vf2_d(41.58883083359671856503), // log(2^60)
			    vsel_vf2_vo_vf2_vf2(oref, dfadd2_vf2_vf2_vf2(vcast_vf2_d(1.1447298858494001639), dfneg_vf2_vf2(clc)), clc)); // log(M_PI)

  clln = vsel_vf2_vo_vf2_vf2(otiny, vcast_vf2_f_f(1, 0), vsel_vf2_vo_vf2_vf2(oref, clln, clld));

  // reflection: only evaluated when at least one lane has a < 0.5
  if (!vtestallones_i_vo32(vnot_vo32_vo32(oref))) {
    t = vsub_vf_vf_vf(a, vmul_vf_vf_vf(vcast_vf_f(INT64_C(1) << 12), vcast_vf_vi2(vtruncate_vi2_vf(vmul_vf_vf_vf(a, vcast_vf_f(1.0 / (INT64_C(1) << 12)))))));
    x = dfmul_vf2_vf2_vf2(clld, sinpifk(t));
  }

  clld = vsel_vf2_vo_vf2_vf2(otiny, vcast_vf2_vf_vf(vmul_vf_vf_vf(a, vcast_vf_f((INT64_C(1) << 30)*(float)(INT64_C(1) << 30))), vcast_vf_f(0)), vsel_vf2_vo_vf2_vf2(oref, x, y));

  return df2setab_df2_vf2_vf2(clc, dfdiv_vf2_vf2_vf2(clln, clld));
}

#if !defined(DETERMINISTIC)
/* tgamma with the "_u1" accuracy tier: exp of the gammafk log term times
   the ratio term, then NaN for -Inf / negative integers, and signed Inf
   for a == 0, overflow (a > 36) or a NaN intermediate on the non-negative
   side. */
EXPORT CONST VECTOR_CC vfloat xtgammaf_u1(vfloat a) {
  df2 d = gammafk(a);
  vfloat2 y = dfmul_vf2_vf2_vf2(expk2f(df2geta_vf2_df2(d)), df2getb_vf2_df2(d));
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(y), vf2gety_vf_vf2(y));
  vopmask o;

  o = vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vf_vf(a, vcast_vf_f(-SLEEF_INFINITYf)), vand_vo_vo_vo(vlt_vo_vf_vf(a, vcast_vf_f(0)), visint_vo_vf(a))), vand_vo_vo_vo(vand_vo_vo_vo(visnumber_vo_vf(a), vlt_vo_vf_vf(a, vcast_vf_f(0))), visnan_vo_vf(r)));
  r = vsel_vf_vo_vf_vf(o, vcast_vf_f(SLEEF_NANf), r);

  o = vand_vo_vo_vo(vand_vo_vo_vo(vor_vo_vo_vo(veq_vo_vf_vf(a, vcast_vf_f(SLEEF_INFINITYf)), visnumber_vo_vf(a)), vge_vo_vf_vf(a, vcast_vf_f(-FLT_MIN))), vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vf_vf(a, vcast_vf_f(0)), vgt_vo_vf_vf(a, vcast_vf_f(36))), visnan_vo_vf(r)));
  r = vsel_vf_vo_vf_vf(o, vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), a), r);

  return r;
}

/* lgamma: log|gamma(a)| = gammafk log term + log|ratio term|;
   +Inf for infinities and non-positive integers. */
EXPORT CONST VECTOR_CC vfloat xlgammaf_u1(vfloat a) {
  df2 d = gammafk(a);
  vfloat2 y = dfadd2_vf2_vf2_vf2(df2geta_vf2_df2(d), logk2f(dfabs_vf2_vf2(df2getb_vf2_df2(d))));
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(y), vf2gety_vf_vf2(y));
  vopmask o;

  o = vor_vo_vo_vo(visinf_vo_vf(a), vor_vo_vo_vo(vand_vo_vo_vo(vle_vo_vf_vf(a, vcast_vf_f(0)), visint_vo_vf(a)), vand_vo_vo_vo(visnumber_vo_vf(a), visnan_vo_vf(r))));
  r = vsel_vf_vo_vf_vf(o, vcast_vf_f(SLEEF_INFINITYf), r);

  return r;
}

/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
/* erf: three polynomial regimes selected by |a| thresholds 1.1 / 2.4 / 4.0,
   evaluated in double-float.  The small-argument regime works on a*a
   directly; otherwise erf = 1 - exp(poly(a)) via expk2f; |a| >= 4
   saturates to +-1, with the input's sign restored via vmulsign. */
EXPORT CONST VECTOR_CC vfloat xerff_u1(vfloat a) {
  vfloat s = a, t, u;
  vfloat2 d;

  a = vabs_vf_vf(a);
  vopmask o0 = vlt_vo_vf_vf(a, vcast_vf_f(1.1));
  vopmask o1 = vlt_vo_vf_vf(a, vcast_vf_f(2.4));
  vopmask o2 = vlt_vo_vf_vf(a, vcast_vf_f(4.0));
  u = vsel_vf_vo_vf_vf(o0, vmul_vf_vf_vf(a, a), a);

  t = vsel_vf_vo_vo_f_f_f(o0, o1, +0.7089292194e-4f, -0.1792667899e-4f, -0.9495757695e-5f);
  t = vmla_vf_vf_vf_vf(t, u, vsel_vf_vo_vo_f_f_f(o0, o1, -0.7768311189e-3f, +0.3937633010e-3f, +0.2481465926e-3f));
  t = vmla_vf_vf_vf_vf(t, u, vsel_vf_vo_vo_f_f_f(o0, o1, +0.5159463733e-2f, -0.3949181177e-2f, -0.2918176819e-2f));
  t = vmla_vf_vf_vf_vf(t, u, vsel_vf_vo_vo_f_f_f(o0, o1, -0.2683781274e-1f, +0.2445474640e-1f, +0.2059706673e-1f));
  t = vmla_vf_vf_vf_vf(t, u, vsel_vf_vo_vo_f_f_f(o0, o1, +0.1128318012e+0f, -0.1070996150e+0f, -0.9901899844e-1f));
  d = dfmul_vf2_vf_vf(t, u);
  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_d_d_d(o0, o1, -0.376125876000657465175213237214e+0, -0.634588905908410389971210809210e+0, -0.643598050547891613081201721633e+0));
  d = dfmul_vf2_vf2_vf(d, u);
  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_d_d_d(o0, o1, +0.112837916021059138255978217023e+1, -0.112879855826694507209862753992e+1, -0.112461487742845562801052956293e+1));
  d = dfmul_vf2_vf2_vf(d, a);
  d = vsel_vf2_vo_vf2_vf2(o0, d, dfadd_vf2_vf_vf2(vcast_vf_f(1.0), dfneg_vf2_vf2(expk2f(d))));
  u = vmulsign_vf_vf_vf(vsel_vf_vo_vf_vf(o2, vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(1)), s);
  u = vsel_vf_vo_vf_vf(visnan_vo_vf(a), vcast_vf_f(SLEEF_NANf), u);

  return u;
}

/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
EXPORT CONST VECTOR_CC vfloat
/* erfc with the "_u15" accuracy tier: four regimes on |a| (thresholds
   1.0 / 2.2 / 4.3 / 10.1).  Small arguments use a direct polynomial in a,
   larger ones a polynomial in 1/a, both finished through expk2f;
   negative inputs use erfc(-a) = 2 - erfc(a), |a| >= 10.1 gives 0. */
xerfcf_u15(vfloat a) {
  vfloat s = a, r = vcast_vf_f(0), t;
  vfloat2 u, d, x;
  a = vabs_vf_vf(a);
  vopmask o0 = vlt_vo_vf_vf(a, vcast_vf_f(1.0));
  vopmask o1 = vlt_vo_vf_vf(a, vcast_vf_f(2.2));
  vopmask o2 = vlt_vo_vf_vf(a, vcast_vf_f(4.3));
  vopmask o3 = vlt_vo_vf_vf(a, vcast_vf_f(10.1));

  // polynomial variable: a itself below 2.2, otherwise 1/a (double-float)
  u = vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_vf_vf(a, vcast_vf_f(0)), dfdiv_vf2_vf2_vf2(vcast_vf2_f_f(1, 0), vcast_vf2_vf_vf(a, vcast_vf_f(0))));

  t = vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, -0.8638041618e-4f, -0.6236977242e-5f, -0.3869504035e+0f, +0.1115344167e+1f);
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(u), vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, +0.6000166177e-3f, +0.5749821503e-4f, +0.1288077235e+1f, -0.9454904199e+0f));
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(u), vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, -0.1665703603e-2f, +0.6002851478e-5f, -0.1816803217e+1f, -0.3667259514e+0f));
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(u), vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, +0.1795156277e-3f, -0.2851036377e-2f, +0.1249150872e+1f, +0.7155663371e+0f));
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(u), vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, +0.1914106123e-1f, +0.2260518074e-1f, -0.1328857988e+0f, -0.1262947265e-1f));

  d = dfmul_vf2_vf2_vf(u, t);
  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.102775359343930288081655368891e+0, -0.105247583459338632253369014063e+0, -0.482365310333045318680618892669e+0, -0.498961546254537647970305302739e+0));
  d = dfmul_vf2_vf2_vf2(d, u);
  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.636619483208481931303752546439e+0, -0.635609463574589034216723775292e+0, -0.134450203224533979217859332703e-2, -0.471199543422848492080722832666e-4));
  d = dfmul_vf2_vf2_vf2(d, u);
  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.112837917790537404939545770596e+1, -0.112855987376668622084547028949e+1, -0.572319781150472949561786101080e+0, -0.572364030327966044425932623525e+0));

  x = dfmul_vf2_vf2_vf(vsel_vf2_vo_vf2_vf2(o1, d, vcast_vf2_vf_vf(vneg_vf_vf(a), vcast_vf_f(0))), a);
  x = vsel_vf2_vo_vf2_vf2(o1, x, dfadd2_vf2_vf2_vf2(x, d));
  x = expk2f(x);
  x = vsel_vf2_vo_vf2_vf2(o1, x, dfmul_vf2_vf2_vf2(x, u));

  r = vsel_vf_vo_vf_vf(o3, vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x)), vcast_vf_f(0));
  r = vsel_vf_vo_vf_vf(vsignbit_vo_vf(s), vsub_vf_vf_vf(vcast_vf_f(2), r), r);
  r = vsel_vf_vo_vf_vf(visnan_vo_vf(s), vcast_vf_f(SLEEF_NANf), r);
  return r;
}
#endif // #if !defined(DETERMINISTIC)

#if !defined(DETERMINISTIC) && !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER)
// See sleefsimddp.c for explanation of these macros
#ifdef ENABLE_ALIAS
#define DALIAS_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat) __attribute__((alias( stringify(x ## FUNC) )));
#define DALIAS_vf2_vf(FUNC) EXPORT CONST VECTOR_CC vfloat2 y ## FUNC(vfloat) __attribute__((alias( stringify(x ## FUNC) )));
#define DALIAS_vf_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat, vfloat) __attribute__((alias( stringify(x ## FUNC) )));
#define DALIAS_vf_vf_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat, vfloat, vfloat) __attribute__((alias( stringify(x ## FUNC) )));
#else
#define DALIAS_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat d) { return x ## FUNC (d); }
#define DALIAS_vf2_vf(FUNC) EXPORT CONST VECTOR_CC vfloat2 y ## FUNC(vfloat d) { return x ## FUNC (d); }
#define DALIAS_vf_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat x, vfloat y) { return x ## FUNC (x, y); }
#define DALIAS_vf_vf_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat x, vfloat y, vfloat z) { return x ## FUNC (x, y, z); }
#endif

/* NOTE(review): every DALIAS instantiation below is commented out --
   presumably the y-prefixed aliases are unused in this build; confirm. */
/* DALIAS_vf2_vf(sincospif_u05) */
/* DALIAS_vf2_vf(sincospif_u35) */
/* DALIAS_vf2_vf(modff) */
/* DALIAS_vf_vf(atanf) */
/* DALIAS_vf_vf_vf(atan2f) */
/* DALIAS_vf_vf(asinf) */
/* DALIAS_vf_vf(acosf) */
/* DALIAS_vf_vf_vf(atan2f_u1) */
/* DALIAS_vf_vf(asinf_u1) */
/* DALIAS_vf_vf(acosf_u1) */
/* DALIAS_vf_vf(atanf_u1) */
/* DALIAS_vf_vf(logf) */
/* DALIAS_vf_vf(expf) */
/* DALIAS_vf_vf(cbrtf) */
/* DALIAS_vf_vf(cbrtf_u1) */
/* DALIAS_vf_vf(logf_u1) */
/* DALIAS_vf_vf_vf(powf) */
/* DALIAS_vf_vf(sinhf) */
/* DALIAS_vf_vf(coshf) */
/* DALIAS_vf_vf(tanhf) */
/* DALIAS_vf_vf(sinhf_u35) */
/* DALIAS_vf_vf(coshf_u35) */
/* DALIAS_vf_vf(tanhf_u35) */
/* DALIAS_vf_vf(asinhf) */
/* DALIAS_vf_vf(acoshf) */
/* DALIAS_vf_vf(atanhf) */
/* DALIAS_vf_vf(exp2f) */
/* DALIAS_vf_vf(exp2f_u35) */
/* DALIAS_vf_vf(exp10f) */
/* DALIAS_vf_vf(exp10f_u35) */
/* DALIAS_vf_vf(expm1f) */
/* DALIAS_vf_vf(log10f) */
/* DALIAS_vf_vf(log2f) */
/* DALIAS_vf_vf(log2f_u35) */
/* DALIAS_vf_vf(log1pf) */
/* DALIAS_vf_vf(fabsf) */
/* DALIAS_vf_vf_vf(copysignf) */
/* DALIAS_vf_vf_vf(fmaxf) */
/* DALIAS_vf_vf_vf(fminf) */
/* DALIAS_vf_vf_vf(fdimf) */
/* DALIAS_vf_vf(truncf) */
/* DALIAS_vf_vf(floorf) */
/* DALIAS_vf_vf(ceilf) */
/* DALIAS_vf_vf(roundf) */
/* DALIAS_vf_vf(rintf) */
/* DALIAS_vf_vf_vf_vf(fmaf) */
/* DALIAS_vf_vf_vf(hypotf_u05) */
/* DALIAS_vf_vf_vf(hypotf_u35) */
/* DALIAS_vf_vf_vf(nextafterf) */
/* DALIAS_vf_vf(frfrexpf) */
/* DALIAS_vf_vf_vf(fmodf) */
/* DALIAS_vf_vf_vf(remainderf) */
/* DALIAS_vf_vf(sinpif_u05) */
/* DALIAS_vf_vf(cospif_u05) */
/* DALIAS_vf_vf(tgammaf_u1) */
/* DALIAS_vf_vf(lgammaf_u1) */
/* DALIAS_vf_vf(erff_u1) */
/* DALIAS_vf_vf(erfcf_u15) */
/* DALIAS_vf_vf_vf(fastpowf_u3500) */
#endif // #if !defined(DETERMINISTIC) && !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER)

#if !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER)
/* Runtime query: availability flag for ids 1..10 (forwarded to
   vavailability_i); any other id yields 0. */
EXPORT CONST int xgetIntf(int name) {
  if (1 <= name && name <= 10) return vavailability_i(name);
  return 0;
}

/* Runtime query for pointer-valued properties; id 0 is the ISA name
   string, anything else is a null pointer. */
EXPORT CONST void *xgetPtrf(int name) {
  if (name == 0) return ISANAME;
  return (void *)0;
}
#endif

#if defined(ALIAS_NO_EXT_SUFFIX) && !defined(DETERMINISTIC)
#include ALIAS_NO_EXT_SUFFIX
#endif

#ifdef ENABLE_GNUABI
/* glibc "__*_finite" ABI entry points, provided as weak aliases onto the
   corresponding SLEEF implementations. */
EXPORT CONST VECTOR_CC vfloat __acosf_finite (vfloat) __attribute__((weak, alias(str_xacosf_u1 )));
EXPORT CONST VECTOR_CC vfloat __acoshf_finite (vfloat) __attribute__((weak, alias(str_xacoshf )));
EXPORT CONST VECTOR_CC vfloat __asinf_finite (vfloat) __attribute__((weak, alias(str_xasinf_u1 )));
EXPORT CONST VECTOR_CC vfloat __atan2f_finite (vfloat, vfloat) __attribute__((weak, alias(str_xatan2f_u1 )));
EXPORT CONST VECTOR_CC vfloat __atanhf_finite (vfloat) __attribute__((weak, alias(str_xatanhf )));
EXPORT CONST VECTOR_CC vfloat __coshf_finite (vfloat) __attribute__((weak, alias(str_xcoshf )));
EXPORT CONST VECTOR_CC vfloat __exp10f_finite (vfloat) __attribute__((weak, alias(str_xexp10f )));
EXPORT CONST VECTOR_CC vfloat __exp2f_finite (vfloat) __attribute__((weak, alias(str_xexp2f )));
EXPORT CONST VECTOR_CC vfloat __expf_finite (vfloat) __attribute__((weak, alias(str_xexpf )));
EXPORT CONST VECTOR_CC vfloat __fmodf_finite (vfloat, vfloat) __attribute__((weak, alias(str_xfmodf )));
EXPORT CONST VECTOR_CC vfloat __remainderf_finite(vfloat, vfloat) __attribute__((weak, alias(str_xremainderf)));
EXPORT CONST VECTOR_CC vfloat __modff_finite (vfloat, vfloat *) __attribute__((weak, alias(str_xmodff )));
EXPORT CONST VECTOR_CC vfloat __hypotf_u05_finite(vfloat, vfloat) __attribute__((weak, alias(str_xhypotf_u05)));
EXPORT CONST VECTOR_CC vfloat __lgammaf_u1_finite(vfloat) __attribute__((weak, alias(str_xlgammaf_u1)));
EXPORT CONST VECTOR_CC vfloat __log10f_finite (vfloat) __attribute__((weak, alias(str_xlog10f )));
EXPORT CONST VECTOR_CC vfloat __logf_finite (vfloat) __attribute__((weak, alias(str_xlogf_u1 )));
EXPORT CONST VECTOR_CC vfloat __powf_finite (vfloat, vfloat) __attribute__((weak, alias(str_xpowf )));
EXPORT CONST VECTOR_CC vfloat __sinhf_finite (vfloat) __attribute__((weak, alias(str_xsinhf )));
EXPORT CONST VECTOR_CC vfloat __sqrtf_finite (vfloat) __attribute__((weak, alias(str_xsqrtf )));
EXPORT CONST VECTOR_CC vfloat __tgammaf_u1_finite(vfloat) __attribute__((weak, alias(str_xtgammaf_u1)));

#ifdef HEADER_MASKED
#include HEADER_MASKED
#endif
#endif /* #ifdef ENABLE_GNUABI */

#ifdef ENABLE_MAIN
// gcc -DENABLE_MAIN -Wno-attributes -I../common -I../arch
-DENABLE_AVX2 -mavx2 -mfma sleefsimdsp.c rempitab.c ../common/common.c -lm #include #include #include int main(int argc, char **argv) { vfloat vf1 = vcast_vf_f(atof(argv[1])); //vfloat vf2 = vcast_vf_f(atof(argv[2])); //vfloat r = xpowf(vf1, vf2); //vfloat r = xsqrtf_u05(vf1); //printf("%g\n", xnextafterf(vf1, vf2)[0]); //printf("%g\n", nextafterf(atof(argv[1]), atof(argv[2]))); printf("t = %.20g\n", xlogf_u1(vf1)[0]); printf("c = %.20g\n", logf(atof(argv[1]))); } #endif ================================================ FILE: src/sleefsimdsp_emulation.c ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #ifdef ENABLE_VSX #include "renamevsx.h" #define nsimd_vec_f32 nsimd_vmx_vf32 #define get0(a) vec_extract(a, 0) #define get1(a) vec_extract(a, 1) #define get2(a) vec_extract(a, 2) #define get3(a) vec_extract(a, 3) #define set0(a, b) vec_splats(b) #define set1(a, b) vec_insert(b, a, 1) #define set2(a, b) vec_insert(b, a, 2) #define set3(a, b) vec_insert(b, a, 3) #endif nsimd_vec_f32 xsinf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_sin_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xcosf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_cos_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xtanf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_tan_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xasinf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_asin_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xacosf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_acos_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 
/* Lane-wise emulation wrappers, continued (return type of xatanf is on the
   previous line).  Same pattern throughout: unpack via get0..get3, call the
   scalar-emulation nsimd_*_cpu_f32 routine, repack via set0..set3. */
xatanf(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_atan_u35_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xatan2f(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, a1, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  a1.v0 = get0(a1_); a1.v1 = get1(a1_); a1.v2 = get2(a1_); a1.v3 = get3(a1_);
  ret = nsimd_atan2_u35_cpu_f32(a0, a1);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xlogf(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_log_u35_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xcbrtf(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_cbrt_u35_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

/* "_u1" variants route to the higher-accuracy nsimd_*_u10_cpu_f32 routines. */
nsimd_vec_f32 xsinf_u1(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_sin_u10_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xcosf_u1(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_cos_u10_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xtanf_u1(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_tan_u10_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xasinf_u1(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_asin_u10_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xacosf_u1(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_acos_u10_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xatanf_u1(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_atan_u10_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xatan2f_u1(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, a1, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  a1.v0 = get0(a1_); a1.v1 = get1(a1_); a1.v2 = get2(a1_); a1.v3 = get3(a1_);
  ret = nsimd_atan2_u10_cpu_f32(a0, a1);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xlogf_u1(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_log_u10_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2);
ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xcbrtf_u1(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_cbrt_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xexpf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_exp_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xpowf(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, a1, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); a1.v0 = get0(a1_); a1.v1 = get1(a1_); a1.v2 = get2(a1_); a1.v3 = get3(a1_); ret = nsimd_pow_u10_cpu_f32(a0, a1); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xsinhf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_sinh_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xcoshf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_cosh_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xtanhf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_tanh_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = 
set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xsinhf_u35(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_sinh_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xcoshf_u35(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_cosh_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xtanhf_u35(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_tanh_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xasinhf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_asinh_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xacoshf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_acosh_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xatanhf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_atanh_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } 
nsimd_vec_f32 xexp2f(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_exp2_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xexp2f_u35(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_exp2_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xexp10f(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_exp10_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xexp10f_u35(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_exp10_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xexpm1f(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_expm1_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xlog10f(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_log10_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xlog2f(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; 
nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_log2_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xlog2f_u35(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_log2_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xlog1pf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_log1p_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xsinpif_u05(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_sinpi_u05_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xcospif_u05(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_cospi_u05_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xhypotf_u05(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, a1, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); a1.v0 = get0(a1_); a1.v1 = get1(a1_); a1.v2 = get2(a1_); a1.v3 = get3(a1_); ret = nsimd_hypot_u05_cpu_f32(a0, a1); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 
xhypotf_u35(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, a1, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); a1.v0 = get0(a1_); a1.v1 = get1(a1_); a1.v2 = get2(a1_); a1.v3 = get3(a1_); ret = nsimd_hypot_u35_cpu_f32(a0, a1); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xfmodf(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, a1, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); a1.v0 = get0(a1_); a1.v1 = get1(a1_); a1.v2 = get2(a1_); a1.v3 = get3(a1_); ret = nsimd_fmod_cpu_f32(a0, a1); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xremainderf(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, a1, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); a1.v0 = get0(a1_); a1.v1 = get1(a1_); a1.v2 = get2(a1_); a1.v3 = get3(a1_); ret = nsimd_remainder_cpu_f32(a0, a1); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xlgammaf_u1(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_lgamma_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xtgammaf_u1(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_tgamma_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xerff_u1(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; 
a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_erf_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xerfcf_u15(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_erfc_u15_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } ================================================ FILE: src/sleefsp.c ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) // Always use -ffp-contract=off option to compile SLEEF. #include #include #include #include #include #ifndef ENABLE_BUILTIN_MATH #include #define SQRTF sqrtf #else #define SQRTF __builtin_sqrtf #endif #include "misc.h" extern const float Sleef_rempitabsp[]; #ifdef DORENAME #include "rename.h" #endif #if (defined(_MSC_VER)) #pragma fp_contract (off) #endif #define MLA mlaf #define C2V(x) (x) #include "estrin.h" static INLINE CONST int32_t floatToRawIntBits(float d) { union { float f; int32_t i; } tmp; tmp.f = d; return tmp.i; } static INLINE CONST float intBitsToFloat(int32_t i) { union { float f; int32_t i; } tmp; tmp.i = i; return tmp.f; } static INLINE CONST float fabsfk(float x) { return intBitsToFloat(0x7fffffffL & floatToRawIntBits(x)); } static INLINE CONST float mulsignf(float x, float y) { return intBitsToFloat(floatToRawIntBits(x) ^ (floatToRawIntBits(y) & (1 << 31))); } static INLINE CONST float copysignfk(float x, float y) { return intBitsToFloat((floatToRawIntBits(x) & ~(1 << 31)) ^ (floatToRawIntBits(y) & (1 << 31))); } static INLINE CONST float signf(float d) { return mulsignf(1, d); } 
static INLINE CONST float mlaf(float x, float y, float z) { return x * y + z; } static INLINE CONST float rintfk(float x) { return x < 0 ? (int)(x - 0.5f) : (int)(x + 0.5f); } static INLINE CONST int ceilfk(float x) { return (int)x + (x < 0 ? 0 : 1); } static INLINE CONST float fminfk(float x, float y) { return x < y ? x : y; } static INLINE CONST float fmaxfk(float x, float y) { return x > y ? x : y; } static INLINE CONST int xisintf(float x) { return (x == (int)x); } static INLINE CONST int xisnanf(float x) { return x != x; } static INLINE CONST int xisinff(float x) { return x == SLEEF_INFINITYf || x == -SLEEF_INFINITYf; } static INLINE CONST int xisminff(float x) { return x == -SLEEF_INFINITYf; } static INLINE CONST int xispinff(float x) { return x == SLEEF_INFINITYf; } static INLINE CONST int xisnegzerof(float x) { return floatToRawIntBits(x) == floatToRawIntBits(-0.0); } static INLINE CONST int xisnumberf(float x) { return !xisinff(x) && !xisnanf(x); } static INLINE CONST int ilogbkf(float d) { int m = d < 5.421010862427522E-20f; d = m ? 1.8446744073709552E19f * d : d; int q = (floatToRawIntBits(d) >> 23) & 0xff; q = m ? q - (64 + 0x7f) : q - 0x7f; return q; } // vilogb2kf is similar to ilogbkf, but the argument has to be a // normalized FP value. static INLINE CONST int ilogb2kf(float d) { return ((floatToRawIntBits(d) >> 23) & 0xff) - 0x7f; } EXPORT CONST int xilogbf(float d) { int e = ilogbkf(fabsfk(d)); e = d == 0.0f ? SLEEF_FP_ILOGB0 : e; e = xisnanf(d) ? SLEEF_FP_ILOGBNAN : e; e = xisinff(d) ? INT_MAX : e; return e; } static INLINE CONST float pow2if(int q) { return intBitsToFloat(((int32_t)(q + 0x7f)) << 23); } static INLINE CONST float ldexpkf(float x, int q) { float u; int m; m = q >> 31; m = (((m + q) >> 6) - m) << 4; q = q - (m << 2); m += 127; m = m < 0 ? 0 : m; m = m > 255 ? 
255 : m; u = intBitsToFloat(((int32_t)m) << 23); x = x * u * u * u * u; u = intBitsToFloat(((int32_t)(q + 0x7f)) << 23); return x * u; } static INLINE CONST float ldexp2kf(float d, int e) { // faster than ldexpkf, short reach return d * pow2if(e >> 1) * pow2if(e - (e >> 1)); } static INLINE CONST float ldexp3kf(float d, int e) { // very fast, no denormal return intBitsToFloat(floatToRawIntBits(d) + (e << 23)); } // #ifndef NDEBUG static int checkfp(float x) { if (xisinff(x) || xisnanf(x)) return 1; return 0; } #endif static INLINE CONST float upperf(float d) { return intBitsToFloat(floatToRawIntBits(d) & 0xfffff000); } static INLINE CONST Sleef_float2 df(float h, float l) { Sleef_float2 ret; ret.x = h; ret.y = l; return ret; } static INLINE CONST Sleef_float2 dfx(double d) { Sleef_float2 ret; ret.x = d; ret.y = d - ret.x; return ret; } static INLINE CONST Sleef_float2 dfnormalize_f2_f2(Sleef_float2 t) { Sleef_float2 s; s.x = t.x + t.y; s.y = t.x - s.x + t.y; return s; } static INLINE CONST Sleef_float2 dfscale_f2_f2_f(Sleef_float2 d, float s) { Sleef_float2 r; r.x = d.x * s; r.y = d.y * s; return r; } static INLINE CONST Sleef_float2 dfneg_f2_f2(Sleef_float2 d) { Sleef_float2 r; r.x = -d.x; r.y = -d.y; return r; } static INLINE CONST Sleef_float2 dfabs_f2_f2(Sleef_float2 x) { return df(x.x < 0 ? -x.x : x.x, x.x < 0 ? 
-x.y : x.y); } static INLINE CONST Sleef_float2 dfadd_f2_f_f(float x, float y) { // |x| >= |y| Sleef_float2 r; #ifndef NDEBUG if (!(checkfp(x) || checkfp(y) || fabsfk(x) >= fabsfk(y))) fprintf(stderr, "[dfadd_f2_f_f : %g, %g]", x, y); #endif r.x = x + y; r.y = x - r.x + y; return r; } static INLINE CONST Sleef_float2 dfadd2_f2_f_f(float x, float y) { Sleef_float2 r; r.x = x + y; float v = r.x - x; r.y = (x - (r.x - v)) + (y - v); return r; } static INLINE CONST Sleef_float2 dfadd_f2_f2_f(Sleef_float2 x, float y) { // |x| >= |y| Sleef_float2 r; #ifndef NDEBUG if (!(checkfp(x.x) || checkfp(y) || fabsfk(x.x) >= fabsfk(y))) fprintf(stderr, "[dfadd_f2_f2_f : %g %g]", x.x, y); #endif r.x = x.x + y; r.y = x.x - r.x + y + x.y; return r; } static INLINE CONST Sleef_float2 dfadd_f2_f_f2(float x, Sleef_float2 y) { // |x| >= |y| Sleef_float2 r; #ifndef NDEBUG if (!(checkfp(x) || checkfp(y.x) || fabsfk(x) >= fabsfk(y.x))) { fprintf(stderr, "[dfadd_f2_f_f2 : %g %g]\n", x, y.x); fflush(stderr); } #endif r.x = x + y.x; r.y = x - r.x + y.x + y.y; return r; } static INLINE CONST Sleef_float2 dfadd2_f2_f2_f(Sleef_float2 x, float y) { // |x| >= |y| Sleef_float2 r; r.x = x.x + y; float v = r.x - x.x; r.y = (x.x - (r.x - v)) + (y - v); r.y += x.y; return r; } static INLINE CONST Sleef_float2 dfadd2_f2_f_f2(float x, Sleef_float2 y) { Sleef_float2 r; r.x = x + y.x; float v = r.x - x; r.y = (x - (r.x - v)) + (y.x - v) + y.y; return r; } static INLINE CONST Sleef_float2 dfadd_f2_f2_f2(Sleef_float2 x, Sleef_float2 y) { // |x| >= |y| Sleef_float2 r; #ifndef NDEBUG if (!(checkfp(x.x) || checkfp(y.x) || fabsfk(x.x) >= fabsfk(y.x))) fprintf(stderr, "[dfadd_f2_f2_f2 : %g %g]", x.x, y.x); #endif r.x = x.x + y.x; r.y = x.x - r.x + y.x + x.y + y.y; return r; } static INLINE CONST Sleef_float2 dfadd2_f2_f2_f2(Sleef_float2 x, Sleef_float2 y) { Sleef_float2 r; r.x = x.x + y.x; float v = r.x - x.x; r.y = (x.x - (r.x - v)) + (y.x - v); r.y += x.y + y.y; return r; } static INLINE CONST Sleef_float2 
dfsub_f2_f2_f2(Sleef_float2 x, Sleef_float2 y) { // |x| >= |y| Sleef_float2 r; #ifndef NDEBUG if (!(checkfp(x.x) || checkfp(y.x) || fabsfk(x.x) >= fabsfk(y.x))) fprintf(stderr, "[dfsub_f2_f2_f2 : %g %g]", x.x, y.x); #endif r.x = x.x - y.x; r.y = x.x - r.x - y.x + x.y - y.y; return r; } static INLINE CONST Sleef_float2 dfdiv_f2_f2_f2(Sleef_float2 n, Sleef_float2 d) { float t = 1.0f / d.x; float dh = upperf(d.x), dl = d.x - dh; float th = upperf(t ), tl = t - th; float nhh = upperf(n.x), nhl = n.x - nhh; Sleef_float2 q; q.x = n.x * t; float u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl + q.x * (1 - dh * th - dh * tl - dl * th - dl * tl); q.y = t * (n.y - q.x * d.y) + u; return q; } static INLINE CONST Sleef_float2 dfmul_f2_f_f(float x, float y) { float xh = upperf(x), xl = x - xh; float yh = upperf(y), yl = y - yh; Sleef_float2 r; r.x = x * y; r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl; return r; } static INLINE CONST Sleef_float2 dfmul_f2_f2_f(Sleef_float2 x, float y) { float xh = upperf(x.x), xl = x.x - xh; float yh = upperf(y ), yl = y - yh; Sleef_float2 r; r.x = x.x * y; r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y; return r; } static INLINE CONST Sleef_float2 dfmul_f2_f2_f2(Sleef_float2 x, Sleef_float2 y) { float xh = upperf(x.x), xl = x.x - xh; float yh = upperf(y.x), yl = y.x - yh; Sleef_float2 r; r.x = x.x * y.x; r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x; return r; } static INLINE CONST float dfmul_f_f2_f2(Sleef_float2 x, Sleef_float2 y) { float xh = upperf(x.x), xl = x.x - xh; float yh = upperf(y.x), yl = y.x - yh; return x.y * yh + xh * y.y + xl * yl + xh * yl + xl * yh + xh * yh; } static INLINE CONST Sleef_float2 dfsqu_f2_f2(Sleef_float2 x) { float xh = upperf(x.x), xl = x.x - xh; Sleef_float2 r; r.x = x.x * x.x; r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y); return r; } static INLINE CONST float dfsqu_f_f2(Sleef_float2 x) { float xh = upperf(x.x), xl = x.x - xh; return xh * 
x.y + xh * x.y + xl * xl + (xh * xl + xh * xl) + xh * xh; } static INLINE CONST Sleef_float2 dfrec_f2_f(float d) { float t = 1.0f / d; float dh = upperf(d), dl = d - dh; float th = upperf(t), tl = t - th; Sleef_float2 q; q.x = t; q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl); return q; } static INLINE CONST Sleef_float2 dfrec_f2_f2(Sleef_float2 d) { float t = 1.0f / d.x; float dh = upperf(d.x), dl = d.x - dh; float th = upperf(t ), tl = t - th; Sleef_float2 q; q.x = t; q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl - d.y * t); return q; } static INLINE CONST Sleef_float2 dfsqrt_f2_f2(Sleef_float2 d) { float t = SQRTF(d.x + d.y); return dfscale_f2_f2_f(dfmul_f2_f2_f2(dfadd2_f2_f2_f2(d, dfmul_f2_f_f(t, t)), dfrec_f2_f(t)), 0.5f); } static INLINE CONST Sleef_float2 dfsqrt_f2_f(float d) { float t = SQRTF(d); return dfscale_f2_f2_f(dfmul_f2_f2_f2(dfadd2_f2_f_f2(d, dfmul_f2_f_f(t, t)), dfrec_f2_f(t)), 0.5); } // typedef struct { float d; int32_t i; } fi_t; typedef struct { Sleef_float2 df; int32_t i; } dfi_t; static CONST fi_t rempisubf(float x) { fi_t ret; float fr = x - (float)(INT64_C(1) << 10) * (int32_t)(x * (1.0f / (INT64_C(1) << 10))); ret.i = ((7 & ((x > 0 ? 4 : 3) + (int32_t)(fr * 8))) - 3) >> 1; fr = fr - 0.25f * (int32_t)(fr * 4 + mulsignf(0.5f, x)); fr = fabsfk(fr) > 0.125f ? (fr - mulsignf(0.5f, x)) : fr; fr = fabsfk(fr) > 1e+10f ? 0 : fr; if (fabsfk(x) == 0.12499999254941940308f) { fr = x; ret.i = 0; } ret.d = fr; return ret; } static CONST dfi_t rempif(float a) { Sleef_float2 x, y, z; fi_t di; float t; int ex = ilogb2kf(a) - 25, q = ex > (90 - 25) ? 
-64 : 0; a = ldexp3kf(a, q); if (ex < 0) ex = 0; ex *= 4; x = dfmul_f2_f_f(a, Sleef_rempitabsp[ex]); di = rempisubf(x.x); q = di.i; x.x = di.d; x = dfnormalize_f2_f2(x); y = dfmul_f2_f_f(a, Sleef_rempitabsp[ex+1]); x = dfadd2_f2_f2_f2(x, y); di = rempisubf(x.x); q += di.i; x.x = di.d; x = dfnormalize_f2_f2(x); y = dfmul_f2_f2_f(df(Sleef_rempitabsp[ex+2], Sleef_rempitabsp[ex+3]), a); x = dfadd2_f2_f2_f2(x, y); x = dfnormalize_f2_f2(x); x = dfmul_f2_f2_f2(x, df(3.1415927410125732422f*2, -8.7422776573475857731e-08f*2)); dfi_t ret = { fabsfk(a) < 0.7f ? df(a, 0) : x, q }; return ret; } EXPORT CONST float xsinf(float d) { int q; float u, s, t = d; if (fabsfk(d) < TRIGRANGEMAX2f) { q = (int)rintfk(d * (float)M_1_PI); d = mlaf(q, -PI_A2f, d); d = mlaf(q, -PI_B2f, d); d = mlaf(q, -PI_C2f, d); } else if (fabsfk(d) < TRIGRANGEMAXf) { q = (int)rintfk(d * (float)M_1_PI); d = mlaf(q, -PI_Af, d); d = mlaf(q, -PI_Bf, d); d = mlaf(q, -PI_Cf, d); d = mlaf(q, -PI_Df, d); } else { dfi_t dfi = rempif(t); q = ((dfi.i & 3) * 2 + (dfi.df.x > 0) + 1) >> 2; if ((dfi.i & 1) != 0) { dfi.df = dfadd2_f2_f2_f2(dfi.df, df(mulsignf(3.1415927410125732422f*-0.5, dfi.df.x), mulsignf(-8.7422776573475857731e-08f*-0.5, dfi.df.x))); } d = dfi.df.x + dfi.df.y; if (xisinff(t) || xisnanf(t)) d = SLEEF_NANf; } s = d * d; if ((q & 1) != 0) d = -d; u = 2.6083159809786593541503e-06f; u = mlaf(u, s, -0.0001981069071916863322258f); u = mlaf(u, s, 0.00833307858556509017944336f); u = mlaf(u, s, -0.166666597127914428710938f); u = mlaf(s, u * d, d); if (xisnegzerof(t)) u = -0.0f; return u; } EXPORT CONST float xsinf_u1(float d) { int q; float u; Sleef_float2 s, t, x; if (fabsfk(d) < TRIGRANGEMAX2f) { q = (int)rintfk(d * (float)M_1_PI); u = mlaf(q, -PI_A2f, d); s = dfadd2_f2_f_f(u, q * (-PI_B2f)); s = dfadd_f2_f2_f(s, q * (-PI_C2f)); } else { dfi_t dfi = rempif(d); q = ((dfi.i & 3) * 2 + (dfi.df.x > 0) + 1) >> 2; if ((dfi.i & 1) != 0) { dfi.df = dfadd2_f2_f2_f2(dfi.df, df(mulsignf(3.1415927410125732422f*-0.5, 
dfi.df.x), mulsignf(-8.7422776573475857731e-08f*-0.5, dfi.df.x))); } s = dfnormalize_f2_f2(dfi.df); if (xisinff(d) || xisnanf(d)) s.x = SLEEF_NANf; } t = s; s = dfsqu_f2_f2(s); u = 2.6083159809786593541503e-06f; u = mlaf(u, s.x, -0.0001981069071916863322258f); u = mlaf(u, s.x, 0.00833307858556509017944336f); x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f(-0.166666597127914428710938f, u * s.x), s)); u = dfmul_f_f2_f2(t, x); if ((q & 1) != 0) u = -u; if (xisnegzerof(d)) u = d; return u; } EXPORT CONST float xcosf(float d) { int q; float u, s, t = d; if (fabsfk(d) < TRIGRANGEMAX2f) { q = 1 + 2*(int)rintfk(d * (float)M_1_PI - 0.5f); d = mlaf(q, -PI_A2f*0.5f, d); d = mlaf(q, -PI_B2f*0.5f, d); d = mlaf(q, -PI_C2f*0.5f, d); } else if (fabsfk(d) < TRIGRANGEMAXf) { q = 1 + 2*(int)rintfk(d * (float)M_1_PI - 0.5f); d = mlaf(q, -PI_Af*0.5f, d); d = mlaf(q, -PI_Bf*0.5f, d); d = mlaf(q, -PI_Cf*0.5f, d); d = mlaf(q, -PI_Df*0.5f, d); } else { dfi_t dfi = rempif(t); q = ((dfi.i & 3) * 2 + (dfi.df.x > 0) + 7) >> 1; if ((dfi.i & 1) == 0) { dfi.df = dfadd2_f2_f2_f2(dfi.df, df(mulsignf(3.1415927410125732422f*-0.5, dfi.df.x > 0 ? 1 : -1), mulsignf(-8.7422776573475857731e-08f*-0.5, dfi.df.x > 0 ? 
1 : -1))); } d = dfi.df.x + dfi.df.y; if (xisinff(t) || xisnanf(t)) d = SLEEF_NANf; } s = d * d; if ((q & 2) == 0) d = -d; u = 2.6083159809786593541503e-06f; u = mlaf(u, s, -0.0001981069071916863322258f); u = mlaf(u, s, 0.00833307858556509017944336f); u = mlaf(u, s, -0.166666597127914428710938f); u = mlaf(s, u * d, d); return u; } EXPORT CONST float xcosf_u1(float d) { float u; Sleef_float2 s, t, x; int q; if (fabsfk(d) < TRIGRANGEMAX2f) { d = fabsfk(d); float dq = mlaf(rintfk(d * (float)M_1_PI - 0.5f), 2, 1); q = (int)dq; s = dfadd2_f2_f_f (d, dq * (-PI_A2f*0.5f)); s = dfadd2_f2_f2_f(s, dq * (-PI_B2f*0.5f)); s = dfadd2_f2_f2_f(s, dq * (-PI_C2f*0.5f)); } else { dfi_t dfi = rempif(d); q = ((dfi.i & 3) * 2 + (dfi.df.x > 0) + 7) >> 1; if ((dfi.i & 1) == 0) { dfi.df = dfadd2_f2_f2_f2(dfi.df, df(mulsignf(3.1415927410125732422f*-0.5, dfi.df.x > 0 ? 1 : -1), mulsignf(-8.7422776573475857731e-08f*-0.5, dfi.df.x > 0 ? 1 : -1))); } s = dfnormalize_f2_f2(dfi.df); if (xisinff(d) || xisnanf(d)) s.x = SLEEF_NANf; } t = s; s = dfsqu_f2_f2(s); u = 2.6083159809786593541503e-06f; u = mlaf(u, s.x, -0.0001981069071916863322258f); u = mlaf(u, s.x, 0.00833307858556509017944336f); x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f(-0.166666597127914428710938f, u * s.x), s)); u = dfmul_f_f2_f2(t, x); if ((((int)q) & 2) == 0) u = -u; return u; } EXPORT CONST float xfastsinf_u3500(float d) { int q; float u, s, t = d; q = rintfk(d * (float)M_1_PI); d = mlaf(q, -(float)M_PI, d); s = d * d; u = -0.1881748176e-3; u = mlaf(u, s, +0.8323502727e-2); u = mlaf(u, s, -0.1666651368e+0); u = mlaf(s * d, u, d); if ((q & 1) != 0) u = -u; if (UNLIKELY(fabsfk(t) > 30.0f)) return xsinf(t); return u; } EXPORT CONST float xfastcosf_u3500(float d) { int q; float u, s, t = d; q = rintfk(mlaf(d, (float)M_1_PI, -0.5f)); d = mlaf(q, -(float)M_PI, d - (float)M_PI*0.5f); s = d * d; u = -0.1881748176e-3; u = mlaf(u, s, +0.8323502727e-2); u = mlaf(u, s, -0.1666651368e+0); u = mlaf(s * d, u, d); if ((q & 1) == 0) u = -u; 
if (UNLIKELY(fabsfk(t) > 30.0f)) return xcosf(t); return u; } EXPORT CONST Sleef_float2 xsincosf(float d) { int q; float u, s, t; Sleef_float2 r; s = d; if (fabsfk(d) < TRIGRANGEMAX2f) { q = (int)rintfk(d * ((float)(2 * M_1_PI))); s = mlaf(q, -PI_A2f*0.5f, s); s = mlaf(q, -PI_B2f*0.5f, s); s = mlaf(q, -PI_C2f*0.5f, s); } else if (fabsfk(d) < TRIGRANGEMAXf) { q = (int)rintfk(d * ((float)(2 * M_1_PI))); s = mlaf(q, -PI_Af*0.5f, s); s = mlaf(q, -PI_Bf*0.5f, s); s = mlaf(q, -PI_Cf*0.5f, s); s = mlaf(q, -PI_Df*0.5f, s); } else { dfi_t dfi = rempif(d); q = dfi.i; s = dfi.df.x + dfi.df.y; if (xisinff(d) || xisnanf(d)) s = SLEEF_NANf; } t = s; s = s * s; u = -0.000195169282960705459117889f; u = mlaf(u, s, 0.00833215750753879547119141f); u = mlaf(u, s, -0.166666537523269653320312f); u = u * s * t; r.x = t + u; if (xisnegzerof(d)) r.x = -0.0f; u = -2.71811842367242206819355e-07f; u = mlaf(u, s, 2.47990446951007470488548e-05f); u = mlaf(u, s, -0.00138888787478208541870117f); u = mlaf(u, s, 0.0416666641831398010253906f); u = mlaf(u, s, -0.5f); r.y = u * s + 1; if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; } if ((q & 2) != 0) { r.x = -r.x; } if (((q+1) & 2) != 0) { r.y = -r.y; } return r; } EXPORT CONST Sleef_float2 xsincosf_u1(float d) { int q; float u; Sleef_float2 r, s, t, x; if (fabsfk(d) < TRIGRANGEMAX2f) { q = (int)rintfk(d * (float)(2 * M_1_PI)); u = mlaf(q, -PI_A2f*0.5f, d); s = dfadd2_f2_f_f(u, q * (-PI_B2f*0.5f)); s = dfadd_f2_f2_f(s, q * (-PI_C2f*0.5f)); } else { dfi_t dfi = rempif(d); q = dfi.i; s = dfi.df; if (xisinff(d) || xisnanf(d)) s.x = SLEEF_NANf; } t = s; s.x = dfsqu_f_f2(s); u = -0.000195169282960705459117889f; u = mlaf(u, s.x, 0.00833215750753879547119141f); u = mlaf(u, s.x, -0.166666537523269653320312f); u *= s.x * t.x; x = dfadd_f2_f2_f(t, u); r.x = x.x + x.y; if (xisnegzerof(d)) r.x = -0.0f; u = -2.71811842367242206819355e-07f; u = mlaf(u, s.x, 2.47990446951007470488548e-05f); u = mlaf(u, s.x, -0.00138888787478208541870117f); u = mlaf(u, s.x, 
0.0416666641831398010253906f); u = mlaf(u, s.x, -0.5f); x = dfadd_f2_f_f2(1, dfmul_f2_f_f(s.x, u)); r.y = x.x + x.y; if ((q & 1) != 0) { u = r.y; r.y = r.x; r.x = u; } if ((q & 2) != 0) { r.x = -r.x; } if (((q+1) & 2) != 0) { r.y = -r.y; } return r; } EXPORT CONST Sleef_float2 xsincospif_u05(float d) { float u, s, t; Sleef_float2 r, x, s2; u = d * 4; int q = ceilfk(u) & ~(int)1; s = u - (float)q; t = s; s = s * s; s2 = dfmul_f2_f_f(t, t); // u = +0.3093842054e-6; u = mlaf(u, s, -0.3657307388e-4); u = mlaf(u, s, +0.2490393585e-2); x = dfadd2_f2_f_f2(u * s, df(-0.080745510756969451904, -1.3373665339076936258e-09)); x = dfadd2_f2_f2_f2(dfmul_f2_f2_f2(s2, x), df(0.78539818525314331055, -2.1857338617566484855e-08)); x = dfmul_f2_f2_f(x, t); r.x = x.x + x.y; if (xisnegzerof(d)) r.x = -0.0f; u = -0.2430611801e-7; u = mlaf(u, s, +0.3590577080e-5); u = mlaf(u, s, -0.3259917721e-3); x = dfadd2_f2_f_f2(u * s, df(0.015854343771934509277, 4.4940051354032242811e-10)); x = dfadd2_f2_f2_f2(dfmul_f2_f2_f2(s2, x), df(-0.30842512845993041992, -9.0728339030733922277e-09)); x = dfadd2_f2_f2_f(dfmul_f2_f2_f2(x, s2), 1); r.y = x.x + x.y; if ((q & 2) != 0) { s = r.y; r.y = r.x; r.x = s; } if ((q & 4) != 0) { r.x = -r.x; } if (((q+2) & 4) != 0) { r.y = -r.y; } if (fabsfk(d) > 1e+7f) { r.x = 0; r.y = 1; } if (xisinff(d)) { r.x = r.y = SLEEF_NANf; } return r; } EXPORT CONST Sleef_float2 xsincospif_u35(float d) { float u, s, t; Sleef_float2 r; u = d * 4; int q = ceilfk(u) & ~(int)1; s = u - (float)q; t = s; s = s * s; // u = -0.3600925265e-4; u = mlaf(u, s, +0.2490088111e-2); u = mlaf(u, s, -0.8074551076e-1); u = mlaf(u, s, +0.7853981853e+0); r.x = u * t; u = +0.3539815225e-5; u = mlaf(u, s, -0.3259574005e-3); u = mlaf(u, s, +0.1585431583e-1); u = mlaf(u, s, -0.3084251285e+0); u = mlaf(u, s, 1); r.y = u; if ((q & 2) != 0) { s = r.y; r.y = r.x; r.x = s; } if ((q & 4) != 0) { r.x = -r.x; } if (((q+2) & 4) != 0) { r.y = -r.y; } if (fabsfk(d) > 1e+7f) { r.x = 0; r.y = 1; } if (xisinff(d)) { r.x = 
r.y = SLEEF_NANf; } return r; } EXPORT CONST float xtanf(float d) { int q; float u, s, x; x = d; if (fabsfk(d) < TRIGRANGEMAX2f*0.5f) { q = (int)rintfk(d * (float)(2 * M_1_PI)); x = mlaf(q, -PI_A2f*0.5f, x); x = mlaf(q, -PI_B2f*0.5f, x); x = mlaf(q, -PI_C2f*0.5f, x); } else if (fabsfk(d) < TRIGRANGEMAXf) { q = (int)rintfk(d * (float)(2 * M_1_PI)); x = mlaf(q, -PI_Af*0.5f, x); x = mlaf(q, -PI_Bf*0.5f, x); x = mlaf(q, -PI_Cf*0.5f, x); x = mlaf(q, -PI_Df*0.5f, x); } else { dfi_t dfi = rempif(d); q = dfi.i; x = dfi.df.x + dfi.df.y; if (xisinff(d) || xisnanf(d)) x = SLEEF_NANf; } s = x * x; if ((q & 1) != 0) x = -x; float s2 = s * s, s4 = s2 * s2; u = POLY6(s, s2, s4, 0.00927245803177356719970703f, 0.00331984995864331722259521f, 0.0242998078465461730957031f, 0.0534495301544666290283203f, 0.133383005857467651367188f, 0.333331853151321411132812f); u = mlaf(s, u * x, x); if ((q & 1) != 0) u = 1.0f / u; return u; } EXPORT CONST float xtanf_u1(float d) { int q; float u; Sleef_float2 s, t, x; if (fabsfk(d) < TRIGRANGEMAX2f) { q = (int)rintfk(d * (float)(2 * M_1_PI)); u = mlaf(q, -PI_A2f*0.5f, d); s = dfadd2_f2_f_f(u, q * (-PI_B2f*0.5f)); s = dfadd_f2_f2_f(s, q * (-PI_C2f*0.5f)); } else { dfi_t dfi = rempif(d); q = dfi.i; s = dfi.df; if (xisinff(d) || xisnanf(d)) s.x = SLEEF_NANf; } if ((q & 1) != 0) s = dfneg_f2_f2(s); t = s; s = dfsqu_f2_f2(s); s = dfnormalize_f2_f2(s); u = 0.00446636462584137916564941f; u = mlaf(u, s.x, -8.3920182078145444393158e-05f); u = mlaf(u, s.x, 0.0109639242291450500488281f); u = mlaf(u, s.x, 0.0212360303848981857299805f); u = mlaf(u, s.x, 0.0540687143802642822265625f); x = dfadd_f2_f_f(0.133325666189193725585938f, u * s.x); x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f2(0.33333361148834228515625f, dfmul_f2_f2_f2(s, x)), s)); x = dfmul_f2_f2_f2(t, x); if ((q & 1) != 0) x = dfrec_f2_f2(x); u = x.x + x.y; if (xisnegzerof(d)) u = -0.0f; return u; } EXPORT CONST float xatanf(float s) { float t, u; int q = 0; if (signf(s) == -1) { s = -s; q = 2; } if 
(s > 1) { s = 1.0f / s; q |= 1; } // s > 1: use atan(s) = pi/2 - atan(1/s)
  t = s * s;
  float t2 = t * t, t4 = t2 * t2;
  u = POLY8(t, t2, t4,
            0.00282363896258175373077393f,
            -0.0159569028764963150024414f,
            0.0425049886107444763183594f,
            -0.0748900920152664184570312f,
            0.106347933411598205566406f,
            -0.142027363181114196777344f,
            0.199926957488059997558594f,
            -0.333331018686294555664062f);
  t = s + s * (t * u);
  if ((q & 1) != 0) t = 1.570796326794896557998982f - t; // undo the inversion
  if ((q & 2) != 0) t = -t;                              // undo the sign fold
  return t;
}

// Helper for atan2: atan(y/x) folded into the proper region. q counts quarter
// turns accumulated by the folds; the result is q*(pi/2) + atan of the folded
// ratio.
static INLINE CONST float atan2kf(float y, float x) {
  float s, t, u;
  int q = 0;
  if (x < 0) { x = -x; q = -2; }
  if (y > x) { t = x; x = y; y = -t; q += 1; }
  s = y / x;
  t = s * s;
  float t2 = t * t, t4 = t2 * t2;
  u = POLY8(t, t2, t4,
            0.00282363896258175373077393f,
            -0.0159569028764963150024414f,
            0.0425049886107444763183594f,
            -0.0748900920152664184570312f,
            0.106347933411598205566406f,
            -0.142027363181114196777344f,
            0.199926957488059997558594f,
            -0.333331018686294555664062f);
  t = u * t * s + s;
  t = q * (float)(M_PI/2) + t;
  return t;
}

// atan2, fast variant; infinities, zeros and NaN are patched up after the
// polynomial evaluation.
EXPORT CONST float xatan2f(float y, float x) {
  float r = atan2kf(fabsfk(y), x);
  r = mulsignf(r, x);
  if (xisinff(x) || x == 0) r = M_PIf/2 - (xisinff(x) ? (signf(x) * (float)(M_PI /2)) : 0);
  if (xisinff(y) ) r = M_PIf/2 - (xisinff(x) ? (signf(x) * (float)(M_PI*1/4)) : 0);
  if ( y == 0) r = (signf(x) == -1 ? M_PIf : 0);
  return xisnanf(x) || xisnanf(y) ? SLEEF_NANf : mulsignf(r, y);
}

// Arc sine, fast variant. For |d| < 0.5 the series is evaluated directly in d;
// otherwise via asin(d) = pi/2 - 2*asin(sqrt((1-|d|)/2)) so the polynomial
// argument stays small.
EXPORT CONST float xasinf(float d) {
  int o = fabsfk(d) < 0.5f;
  float x2 = o ? (d*d) : ((1-fabsfk(d))*0.5f), x = o ? fabsfk(d) : SQRTF(x2), u;
  u = +0.4197454825e-1;
  u = mlaf(u, x2, +0.2424046025e-1);
  u = mlaf(u, x2, +0.4547423869e-1);
  u = mlaf(u, x2, +0.7495029271e-1);
  u = mlaf(u, x2, +0.1666677296e+0);
  u = mlaf(u, x * x2, x);
  float r = o ? u : (M_PIf/2 - 2*u);
  r = mulsignf(r, d); // asin is odd
  return r;
}

// Arc cosine, fast variant (same |d| < 0.5 range split as xasinf).
EXPORT CONST float xacosf(float d) {
  int o = fabsfk(d) < 0.5f;
  float x2 = o ? (d*d) : ((1-fabsfk(d))*0.5f), u;
  float x = o ? fabsfk(d) : SQRTF(x2);
  x = fabsfk(d) == 1.0 ? // ternary continues on the next source line
0 : x; u = +0.4197454825e-1; u = mlaf(u, x2, +0.2424046025e-1); u = mlaf(u, x2, +0.4547423869e-1); u = mlaf(u, x2, +0.7495029271e-1); u = mlaf(u, x2, +0.1666677296e+0); u *= x * x2; float y = 3.1415926535897932f/2 - (mulsignf(x, d) + mulsignf(u, d)); x += u; float r = o ? y : (x*2); if (!o && d < 0) r = dfadd_f2_f2_f(df(3.1415927410125732422f,-8.7422776573475857731e-08f), -r).x; return r; } static Sleef_float2 atan2kf_u1(Sleef_float2 y, Sleef_float2 x) { float u; Sleef_float2 s, t; int q = 0; if (x.x < 0) { x.x = -x.x; x.y = -x.y; q = -2; } if (y.x > x.x) { t = x; x = y; y.x = -t.x; y.y = -t.y; q += 1; } s = dfdiv_f2_f2_f2(y, x); t = dfsqu_f2_f2(s); t = dfnormalize_f2_f2(t); u = -0.00176397908944636583328247f; u = mlaf(u, t.x, 0.0107900900766253471374512f); u = mlaf(u, t.x, -0.0309564601629972457885742f); u = mlaf(u, t.x, 0.0577365085482597351074219f); u = mlaf(u, t.x, -0.0838950723409652709960938f); u = mlaf(u, t.x, 0.109463557600975036621094f); u = mlaf(u, t.x, -0.142626821994781494140625f); u = mlaf(u, t.x, 0.199983194470405578613281f); t = dfmul_f2_f2_f2(t, dfadd_f2_f_f(-0.333332866430282592773438f, u * t.x)); t = dfmul_f2_f2_f2(s, dfadd_f2_f_f2(1, t)); t = dfadd2_f2_f2_f2(dfmul_f2_f2_f(df(1.5707963705062866211f, -4.3711388286737928865e-08f), q), t); return t; } EXPORT CONST float xatan2f_u1(float y, float x) { if (fabsfk(x) < 2.9387372783541830947e-39f) { y *= (UINT64_C(1) << 24); x *= (UINT64_C(1) << 24); } // nexttowardf((1.0 / FLT_MAX), 1) Sleef_float2 d = atan2kf_u1(df(fabsfk(y), 0), df(x, 0)); float r = d.x + d.y; r = mulsignf(r, x); if (xisinff(x) || x == 0) r = (float)M_PI/2 - (xisinff(x) ? (signf(x) * (float)(M_PI /2)) : 0.0f); if (xisinff(y) ) r = (float)M_PI/2 - (xisinff(x) ? (signf(x) * (float)(M_PI*1/4)) : 0.0f); if ( y == 0) r = (signf(x) == -1 ? (float)M_PI : 0.0f); return xisnanf(x) || xisnanf(y) ? SLEEF_NANf : mulsignf(r, y); } EXPORT CONST float xasinf_u1(float d) { int o = fabsfk(d) < 0.5f; float x2 = o ? 
(d*d) : ((1-fabsfk(d))*0.5f), u; Sleef_float2 x = o ? df(fabsfk(d), 0) : dfsqrt_f2_f(x2); x = fabsfk(d) == 1.0f ? df(0, 0) : x; u = +0.4197454825e-1; u = mlaf(u, x2, +0.2424046025e-1); u = mlaf(u, x2, +0.4547423869e-1); u = mlaf(u, x2, +0.7495029271e-1); u = mlaf(u, x2, +0.1666677296e+0); u *= x2 * x.x; Sleef_float2 y = dfadd_f2_f2_f(dfsub_f2_f2_f2(df(3.1415927410125732422f/4,-8.7422776573475857731e-08f/4), x), -u); float r = o ? (u + x.x) : ((y.x + y.y)*2); r = mulsignf(r, d); return r; } EXPORT CONST float xacosf_u1(float d) { int o = fabsfk(d) < 0.5f; float x2 = o ? (d*d) : ((1-fabsfk(d))*0.5f), u; Sleef_float2 x = o ? df(fabsfk(d), 0) : dfsqrt_f2_f(x2); x = fabsfk(d) == 1.0 ? df(0, 0) : x; u = +0.4197454825e-1; u = mlaf(u, x2, +0.2424046025e-1); u = mlaf(u, x2, +0.4547423869e-1); u = mlaf(u, x2, +0.7495029271e-1); u = mlaf(u, x2, +0.1666677296e+0); u = u * x.x * x2; Sleef_float2 y = dfsub_f2_f2_f2(df(3.1415927410125732422f/2,-8.7422776573475857731e-08f/2), dfadd_f2_f_f(mulsignf(x.x, d), mulsignf(u, d))); x = dfadd_f2_f2_f(x, u); y = o ? 
y : dfscale_f2_f2_f(x, 2); if (!o && d < 0) y = dfsub_f2_f2_f2(df(3.1415927410125732422f,-8.7422776573475857731e-08f), y); return y.x + y.y; } EXPORT CONST float xatanf_u1(float d) { Sleef_float2 d2 = atan2kf_u1(df(fabsfk(d), 0.0f), df(1.0f, 0.0f)); float r = d2.x + d2.y; if (xisinff(d)) r = 1.570796326794896557998982f; return mulsignf(r, d); } EXPORT CONST float xlogf(float d) { float x, x2, t, m; int e; int o = d < FLT_MIN; if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32); e = ilogb2kf(d * (1.0f/0.75f)); m = ldexp3kf(d, -e); if (o) e -= 64; x = (m-1.0f) / (m+1.0f); x2 = x * x; t = 0.2392828464508056640625f; t = mlaf(t, x2, 0.28518211841583251953125f); t = mlaf(t, x2, 0.400005877017974853515625f); t = mlaf(t, x2, 0.666666686534881591796875f); t = mlaf(t, x2, 2.0f); x = x * t + 0.693147180559945286226764f * e; if (xisinff(d)) x = SLEEF_INFINITYf; if (d < 0 || xisnanf(d)) x = SLEEF_NANf; if (d == 0) x = -SLEEF_INFINITYf; return x; } EXPORT CONST float xexpf(float d) { int q = (int)rintfk(d * R_LN2f); float s, u; s = mlaf(q, -L2Uf, d); s = mlaf(q, -L2Lf, s); u = 0.000198527617612853646278381; u = mlaf(u, s, 0.00139304355252534151077271); u = mlaf(u, s, 0.00833336077630519866943359); u = mlaf(u, s, 0.0416664853692054748535156); u = mlaf(u, s, 0.166666671633720397949219); u = mlaf(u, s, 0.5); u = s * s * u + s + 1.0f; u = ldexp2kf(u, q); if (d < -104) u = 0; if (d > 104) u = SLEEF_INFINITYf; return u; } static INLINE CONST float expkf(Sleef_float2 d) { int q = (int)rintfk((d.x + d.y) * R_LN2f); Sleef_float2 s, t; float u; s = dfadd2_f2_f2_f(d, q * -L2Uf); s = dfadd2_f2_f2_f(s, q * -L2Lf); s = dfnormalize_f2_f2(s); u = 0.00136324646882712841033936f; u = mlaf(u, s.x, 0.00836596917361021041870117f); u = mlaf(u, s.x, 0.0416710823774337768554688f); u = mlaf(u, s.x, 0.166665524244308471679688f); u = mlaf(u, s.x, 0.499999850988388061523438f); t = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfsqu_f2_f2(s), u)); t = dfadd_f2_f_f2(1, t); u = ldexpkf(t.x + t.y, q); if (d.x < 
-104) u = 0; return u; } static INLINE CONST float expm1kf(float d) { int q = (int)rintfk(d * R_LN2f); float s, u; s = mlaf(q, -L2Uf, d); s = mlaf(q, -L2Lf, s); float s2 = s * s, s4 = s2 * s2; u = POLY6(s, s2, s4, 0.000198527617612853646278381, 0.00139304355252534151077271, 0.00833336077630519866943359, 0.0416664853692054748535156, 0.166666671633720397949219, 0.5); u = s * s * u + s; if (q != 0) u = ldexp2kf(u + 1, q) - 1; return u; } static INLINE CONST Sleef_float2 logkf(float d) { Sleef_float2 x, x2, s; float m, t; int e; int o = d < FLT_MIN; if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32); e = ilogb2kf(d * (1.0f/0.75f)); m = ldexp3kf(d, -e); if (o) e -= 64; x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m)); x2 = dfsqu_f2_f2(x); t = 0.240320354700088500976562; t = mlaf(t, x2.x, 0.285112679004669189453125); t = mlaf(t, x2.x, 0.400007992982864379882812); Sleef_float2 c = df(0.66666662693023681640625f, 3.69183861259614332084311e-09f); s = dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), e); s = dfadd_f2_f2_f2(s, dfscale_f2_f2_f(x, 2)); s = dfadd_f2_f2_f2(s, dfmul_f2_f2_f2(dfmul_f2_f2_f2(x2, x), dfadd2_f2_f2_f2(dfmul_f2_f2_f(x2, t), c))); return s; } EXPORT CONST float xlogf_u1(float d) { Sleef_float2 x, s; float m, t, x2; int e; int o = d < FLT_MIN; if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32); e = ilogb2kf(d * (1.0f/0.75f)); m = ldexp3kf(d, -e); if (o) e -= 64; x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m)); x2 = x.x * x.x; t = +0.3027294874e+0f; t = mlaf(t, x2, +0.3996108174e+0f); t = mlaf(t, x2, +0.6666694880e+0f); s = dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), (float)e); s = dfadd_f2_f2_f2(s, dfscale_f2_f2_f(x, 2)); s = dfadd_f2_f2_f(s, x2 * x.x * t); float r = s.x + s.y; if (xisinff(d)) r = SLEEF_INFINITYf; if (d < 0 || xisnanf(d)) r = SLEEF_NANf; if (d == 0) r = -SLEEF_INFINITYf; return r; } static INLINE CONST Sleef_float2 expk2f(Sleef_float2 
d) { int q = (int)rintfk((d.x + d.y) * R_LN2f); Sleef_float2 s, t; float u; s = dfadd2_f2_f2_f(d, q * -L2Uf); s = dfadd2_f2_f2_f(s, q * -L2Lf); u = +0.1980960224e-3f; u = mlaf(u, s.x, +0.1394256484e-2f); u = mlaf(u, s.x, +0.8333456703e-2f); u = mlaf(u, s.x, +0.4166637361e-1f); t = dfadd2_f2_f2_f(dfmul_f2_f2_f(s, u), +0.166666659414234244790680580464e+0f); t = dfadd2_f2_f2_f(dfmul_f2_f2_f2(s, t), 0.5); t = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f2(dfsqu_f2_f2(s), t)); t = dfadd2_f2_f_f2(1, t); t.x = ldexp2kf(t.x, q); t.y = ldexp2kf(t.y, q); return d.x < -104 ? df(0, 0) : t; } EXPORT CONST float xpowf(float x, float y) { int yisint = (y == (int)y) || (fabsfk(y) >= (float)(INT64_C(1) << 24)); int yisodd = (1 & (int)y) != 0 && yisint && fabsfk(y) < (float)(INT64_C(1) << 24); float result = expkf(dfmul_f2_f2_f(logkf(fabsfk(x)), y)); result = xisnanf(result) ? SLEEF_INFINITYf : result; result *= (x >= 0 ? 1 : (!yisint ? SLEEF_NANf : (yisodd ? -1 : 1))); float efx = mulsignf(fabsfk(x) - 1, y); if (xisinff(y)) result = efx < 0 ? 0.0f : (efx == 0 ? 1.0f : SLEEF_INFINITYf); if (xisinff(x) || x == 0) result = (yisodd ? signf(x) : 1) * ((x == 0 ? -y : y) < 0 ? 
0 : SLEEF_INFINITYf); if (xisnanf(x) || xisnanf(y)) result = SLEEF_NANf; if (y == 0 || x == 1) result = 1; return result; } static INLINE CONST float logk3f(float d) { float x, x2, t, m; int e; int o = d < FLT_MIN; if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32); e = ilogb2kf(d * (1.0f/0.75f)); m = ldexp3kf(d, -e); if (o) e -= 64; x = (m-1) / (m+1); x2 = x * x; t = 0.2392828464508056640625f; t = mlaf(t, x2, 0.28518211841583251953125f); t = mlaf(t, x2, 0.400005877017974853515625f); t = mlaf(t, x2, 0.666666686534881591796875f); t = mlaf(t, x2, 2.0f); x = mlaf(x, t, 0.693147180559945286226764f * e); return x; } static INLINE CONST float expk3f(float d) { int q = (int)rintfk(d * R_LN2f); float s, u; s = mlaf(q, -L2Uf, d); s = mlaf(q, -L2Lf, s); u = 0.000198527617612853646278381; u = mlaf(u, s, 0.00139304355252534151077271); u = mlaf(u, s, 0.00833336077630519866943359); u = mlaf(u, s, 0.0416664853692054748535156); u = mlaf(u, s, 0.166666671633720397949219); u = mlaf(u, s, 0.5); u = mlaf(s * s, u, s + 1.0f); u = ldexpkf(u, q); if (d < -104) u = 0; return u; } EXPORT CONST float xfastpowf_u3500(float x, float y) { float result = expk3f(logk3f(fabsfk(x)) * y); int yisint = (y == (int)y) || (fabsfk(y) >= (float)(INT64_C(1) << 24)); int yisodd = (1 & (int)y) != 0 && yisint && fabsfk(y) < (float)(INT64_C(1) << 24); result *= (x < 0 && yisodd) ? -1 : 1; if (x == 0) result = 0; if (y == 0) result = 1; return result; } EXPORT CONST float xsinhf(float x) { float y = fabsfk(x); Sleef_float2 d = expk2f(df(y, 0)); d = dfsub_f2_f2_f2(d, dfrec_f2_f2(d)); y = (d.x + d.y) * 0.5f; y = fabsfk(x) > 89 ? SLEEF_INFINITYf : y; y = xisnanf(y) ? SLEEF_INFINITYf : y; y = mulsignf(y, x); y = xisnanf(x) ? SLEEF_NANf : y; return y; } EXPORT CONST float xcoshf(float x) { float y = fabsfk(x); Sleef_float2 d = expk2f(df(y, 0)); d = dfadd_f2_f2_f2(d, dfrec_f2_f2(d)); y = (d.x + d.y) * 0.5f; y = fabsfk(x) > 89 ? SLEEF_INFINITYf : y; y = xisnanf(y) ? 
SLEEF_INFINITYf : y; // tail of xcoshf's NaN guard (ternary begun on the previous source line)
  y = xisnanf(x) ? SLEEF_NANf : y;
  return y;
}

// Hyperbolic tangent, high-accuracy variant:
// tanh(x) = (e - 1/e) / (e + 1/e) with e = exp(|x|) in double-float.
EXPORT CONST float xtanhf(float x) {
  float y = fabsfk(x);
  Sleef_float2 d = expk2f(df(y, 0));
  Sleef_float2 e = dfrec_f2_f2(d);
  d = dfdiv_f2_f2_f2(dfsub_f2_f2_f2(d, e), dfadd_f2_f2_f2(d, e));
  y = d.x + d.y;
  y = fabsfk(x) > 18.714973875f ? 1.0f : y; // saturates to 1 beyond this threshold
  y = xisnanf(y) ? 1.0f : y;
  y = mulsignf(y, x); // tanh is odd
  y = xisnanf(x) ? SLEEF_NANf : y;
  return y;
}

// Hyperbolic sine, fast variant, via expm1 for accuracy near zero:
// sinh(x) = (e+2)/(e+1) * e/2 with e = expm1(|x|).
EXPORT CONST float xsinhf_u35(float x) {
  float e = expm1kf(fabsfk(x));
  float y = (e + 2) / (e + 1) * (0.5f * e);
  y = fabsfk(x) > 88 ? SLEEF_INFINITYf : y;
  y = xisnanf(y) ? SLEEF_INFINITYf : y;
  y = mulsignf(y, x); // sinh is odd
  y = xisnanf(x) ? SLEEF_NANf : y;
  return y;
}

// Hyperbolic cosine, fast variant: cosh(x) = (e + 1/e)/2 with e = exp(|x|).
EXPORT CONST float xcoshf_u35(float x) {
  float e = xexpf(fabsfk(x));
  float y = 0.5f * e + 0.5f / e;
  y = fabsfk(x) > 88 ? SLEEF_INFINITYf : y;
  y = xisnanf(y) ? SLEEF_INFINITYf : y;
  y = xisnanf(x) ? SLEEF_NANf : y;
  return y;
}

// Hyperbolic tangent, fast variant: tanh(x) = d/(d+2) with d = expm1(2|x|).
EXPORT CONST float xtanhf_u35(float x) {
  float y = fabsfk(x);
  float d = expm1kf(2*y);
  y = d / (d + 2);
  y = fabsfk(x) > 18.714973875f ? 1.0f : y;
  y = xisnanf(y) ? 1.0f : y;
  y = mulsignf(y, x);
  y = xisnanf(x) ? SLEEF_NANf : y;
  return y;
}

// Double-float natural log of a double-float argument; used by the inverse
// hyperbolic functions. log(m) is evaluated via the atanh form
// 2*atanh((m-1)/(m+1)) after scaling m near 1.
static INLINE CONST Sleef_float2 logk2f(Sleef_float2 d) {
  Sleef_float2 x, x2, m, s;
  float t;
  int e;
  e = ilogbkf(d.x * (1.0f/0.75f)); // center the mantissa around 1
  m = dfscale_f2_f2_f(d, pow2if(-e));
  x = dfdiv_f2_f2_f2(dfadd2_f2_f2_f(m, -1), dfadd2_f2_f2_f(m, 1));
  x2 = dfsqu_f2_f2(x);
  t = 0.2392828464508056640625f;
  t = mlaf(t, x2.x, 0.28518211841583251953125f);
  t = mlaf(t, x2.x, 0.400005877017974853515625f);
  t = mlaf(t, x2.x, 0.666666686534881591796875f);
  // e * log(2) carried as a double-float constant.
  s = dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), e);
  s = dfadd_f2_f2_f2(s, dfscale_f2_f2_f(x, 2));
  s = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfmul_f2_f2_f2(x2, x), t));
  return s;
}

// Inverse hyperbolic sine: asinh(x) = log(x + sqrt(x^2 + 1)) in double-float;
// for |x| > 1 the computation works with 1/x to avoid overflow in x^2.
EXPORT CONST float xasinhf(float x) {
  float y = fabsfk(x);
  Sleef_float2 d;
  d = y > 1 ? dfrec_f2_f(x) : df(y, 0);
  d = dfsqrt_f2_f2(dfadd2_f2_f2_f(dfsqu_f2_f2(d), 1));
  d = y > 1 ? // ternary continues on the next source line
dfmul_f2_f2_f(d, y) : d; d = logk2f(dfnormalize_f2_f2(dfadd_f2_f2_f(d, x))); y = d.x + d.y; y = (fabsfk(x) > SQRT_FLT_MAX || xisnanf(y)) ? mulsignf(SLEEF_INFINITYf, x) : y; y = xisnanf(x) ? SLEEF_NANf : y; y = xisnegzerof(x) ? -0.0f : y; return y; } EXPORT CONST float xacoshf(float x) { Sleef_float2 d = logk2f(dfadd2_f2_f2_f(dfmul_f2_f2_f2(dfsqrt_f2_f2(dfadd2_f2_f_f(x, 1)), dfsqrt_f2_f2(dfadd2_f2_f_f(x, -1))), x)); float y = d.x + d.y; y = (x > SQRT_FLT_MAX || xisnanf(y)) ? SLEEF_INFINITYf : y; y = x == 1.0f ? 0.0f : y; y = x < 1.0f ? SLEEF_NANf : y; y = xisnanf(x) ? SLEEF_NANf : y; return y; } EXPORT CONST float xatanhf(float x) { float y = fabsfk(x); Sleef_float2 d = logk2f(dfdiv_f2_f2_f2(dfadd2_f2_f_f(1, y), dfadd2_f2_f_f(1, -y))); y = y > 1.0f ? SLEEF_NANf : (y == 1.0f ? SLEEF_INFINITYf : (d.x + d.y) * 0.5f); y = xisinff(x) || xisnanf(y) ? SLEEF_NANf : y; y = mulsignf(y, x); y = xisnanf(x) ? SLEEF_NANf : y; return y; } EXPORT CONST float xexp2f(float d) { int q = (int)rintfk(d); float s, u; s = d - q; u = +0.1535920892e-3; u = mlaf(u, s, +0.1339262701e-2); u = mlaf(u, s, +0.9618384764e-2); u = mlaf(u, s, +0.5550347269e-1); u = mlaf(u, s, +0.2402264476e+0); u = mlaf(u, s, +0.6931471825e+0); u = dfnormalize_f2_f2(dfadd_f2_f_f2(1, dfmul_f2_f_f(u, s))).x; u = ldexp2kf(u, q); if (d >= 128) u = SLEEF_INFINITYf; if (d < -150) u = 0; return u; } EXPORT CONST float xexp2f_u35(float d) { int q = (int)rintfk(d); float s, u; s = d - q; u = +0.1535920892e-3; u = mlaf(u, s, +0.1339262701e-2); u = mlaf(u, s, +0.9618384764e-2); u = mlaf(u, s, +0.5550347269e-1); u = mlaf(u, s, +0.2402264476e+0); u = mlaf(u, s, +0.6931471825e+0); u = mlaf(u, s, +0.1000000000e+1); u = ldexp2kf(u, q); if (d >= 128) u = SLEEF_INFINITYf; if (d < -150) u = 0; return u; } EXPORT CONST float xexp10f(float d) { int q = (int)rintfk(d * (float)LOG10_2); float s, u; s = mlaf(q, -L10Uf, d); s = mlaf(q, -L10Lf, s); u = +0.6802555919e-1; u = mlaf(u, s, +0.2078080326e+0); u = mlaf(u, s, +0.5393903852e+0); u = 
mlaf(u, s, +0.1171245337e+1); u = mlaf(u, s, +0.2034678698e+1); u = mlaf(u, s, +0.2650949001e+1); Sleef_float2 x = dfadd_f2_f2_f(df(2.3025851249694824219, -3.1705172516493593157e-08), u * s); u = dfnormalize_f2_f2(dfadd_f2_f_f2(1, dfmul_f2_f2_f(x, s))).x; u = ldexp2kf(u, q); if (d > 38.5318394191036238941387f) u = SLEEF_INFINITYf; // log10(FLT_MAX) if (d < -50) u = 0; return u; } EXPORT CONST float xexp10f_u35(float d) { int q = (int)rintfk(d * (float)LOG10_2); float s, u; s = mlaf(q, -L10Uf, d); s = mlaf(q, -L10Lf, s); u = +0.2064004987e+0; u = mlaf(u, s, +0.5417877436e+0); u = mlaf(u, s, +0.1171286821e+1); u = mlaf(u, s, +0.2034656048e+1); u = mlaf(u, s, +0.2650948763e+1); u = mlaf(u, s, +0.2302585125e+1); u = mlaf(u, s, +0.1000000000e+1); u = ldexp2kf(u, q); if (d > 38.5318394191036238941387f) u = SLEEF_INFINITYf; // log10(FLT_MAX) if (d < -50) u = 0; return u; } EXPORT CONST float xexpm1f(float a) { Sleef_float2 d = dfadd2_f2_f2_f(expk2f(df(a, 0)), -1.0f); float x = d.x + d.y; if (a > 88.72283172607421875f) x = SLEEF_INFINITYf; if (a < -16.635532333438687426013570f) x = -1; if (xisnegzerof(a)) x = -0.0f; return x; } EXPORT CONST float xlog10f(float d) { Sleef_float2 x, s; float m, t, x2; int e; int o = d < FLT_MIN; if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32); e = ilogb2kf(d * (1.0f/0.75f)); m = ldexp3kf(d, -e); if (o) e -= 64; x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m)); x2 = x.x * x.x; t = +0.1314289868e+0; t = mlaf(t, x2, +0.1735493541e+0); t = mlaf(t, x2, +0.2895309627e+0); s = dfmul_f2_f2_f(df(0.30103001, -1.432098889e-08), (float)e); s = dfadd_f2_f2_f2(s, dfmul_f2_f2_f2(x, df(0.868588984, -2.170757285e-08))); s = dfadd_f2_f2_f(s, x2 * x.x * t); float r = s.x + s.y; if (xisinff(d)) r = SLEEF_INFINITYf; if (d < 0 || xisnanf(d)) r = SLEEF_NANf; if (d == 0) r = -SLEEF_INFINITYf; return r; } EXPORT CONST float xlog2f(float d) { Sleef_float2 x, s; float m, t, x2; int e; int o = d < FLT_MIN; if (o) d *= (float)(INT64_C(1) 
<< 32) * (float)(INT64_C(1) << 32); e = ilogb2kf(d * (1.0f/0.75f)); m = ldexp3kf(d, -e); if (o) e -= 64; x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m)); x2 = x.x * x.x; t = +0.4374550283e+0f; t = mlaf(t, x2, +0.5764790177e+0f); t = mlaf(t, x2, +0.9618012905120f); s = dfadd2_f2_f_f2(e, dfmul_f2_f2_f2(x, df(2.8853900432586669922, 3.2734474483568488616e-08))); s = dfadd2_f2_f2_f(s, x2 * x.x * t); float r = s.x + s.y; if (xisinff(d)) r = SLEEF_INFINITYf; if (d < 0 || xisnanf(d)) r = SLEEF_NANf; if (d == 0) r = -SLEEF_INFINITYf; return r; } EXPORT CONST float xlog2f_u35(float d) { float m, t, x, x2; int e; int o = d < FLT_MIN; if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32); e = ilogb2kf(d * (1.0f/0.75f)); m = ldexp3kf(d, -e); if (o) e -= 64; x = (m - 1) / (m + 1); x2 = x * x; t = +0.4374088347e+0; t = mlaf(t, x2, +0.5764843822e+0); t = mlaf(t, x2, +0.9618024230e+0); float r = mlaf(x2 * x, t, mlaf(x, +0.2885390043e+1, e)); if (xisinff(d)) r = SLEEF_INFINITYf; if (d < 0 || xisnanf(d)) r = SLEEF_NANf; if (d == 0) r = -SLEEF_INFINITYf; return r; } EXPORT CONST float xlog1pf(float d) { Sleef_float2 x, s; float m, t, x2; int e; float dp1 = d + 1; int o = dp1 < FLT_MIN; if (o) dp1 *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32); e = ilogb2kf(dp1 * (1.0f/0.75f)); t = ldexp3kf(1, -e); m = mlaf(d, t, t-1); if (o) e -= 64; x = dfdiv_f2_f2_f2(df(m, 0), dfadd_f2_f_f(2, m)); x2 = x.x * x.x; t = +0.3027294874e+0f; t = mlaf(t, x2, +0.3996108174e+0f); t = mlaf(t, x2, +0.6666694880e+0f); s = dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), (float)e); s = dfadd_f2_f2_f2(s, dfscale_f2_f2_f(x, 2)); s = dfadd_f2_f2_f(s, x2 * x.x * t); float r = s.x + s.y; if (d > 1e+38) r = SLEEF_INFINITYf; if (d < -1) r = SLEEF_NANf; if (d == -1) r = -SLEEF_INFINITYf; if (xisnegzerof(d)) r = -0.0f; return r; } EXPORT CONST float xcbrtf(float d) { float x, y, q = 1.0f; int e, r; e = ilogbkf(fabsfk(d))+1; d = ldexp2kf(d, -e); r = (e + 6144) % 3; 
q = (r == 1) ? 1.2599210498948731647672106f : q; q = (r == 2) ? 1.5874010519681994747517056f : q; q = ldexp2kf(q, (e + 6144) / 3 - 2048); q = mulsignf(q, d); d = fabsfk(d); x = -0.601564466953277587890625f; x = mlaf(x, d, 2.8208892345428466796875f); x = mlaf(x, d, -5.532182216644287109375f); x = mlaf(x, d, 5.898262500762939453125f); x = mlaf(x, d, -3.8095417022705078125f); x = mlaf(x, d, 2.2241256237030029296875f); y = d * x * x; y = (y - (2.0f / 3.0f) * y * (y * x - 1.0f)) * q; return y; } EXPORT CONST float xcbrtf_u1(float d) { float x, y, z; Sleef_float2 q2 = df(1, 0), u, v; int e, r; e = ilogbkf(fabsfk(d))+1; d = ldexp2kf(d, -e); r = (e + 6144) % 3; q2 = (r == 1) ? df(1.2599210739135742188, -2.4018701694217270415e-08) : q2; q2 = (r == 2) ? df(1.5874010324478149414, 1.9520385308169352356e-08) : q2; q2.x = mulsignf(q2.x, d); q2.y = mulsignf(q2.y, d); d = fabsfk(d); x = -0.601564466953277587890625f; x = mlaf(x, d, 2.8208892345428466796875f); x = mlaf(x, d, -5.532182216644287109375f); x = mlaf(x, d, 5.898262500762939453125f); x = mlaf(x, d, -3.8095417022705078125f); x = mlaf(x, d, 2.2241256237030029296875f); y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0f); z = x; u = dfmul_f2_f_f(x, x); u = dfmul_f2_f2_f2(u, u); u = dfmul_f2_f2_f(u, d); u = dfadd2_f2_f2_f(u, -x); y = u.x + u.y; y = -2.0 / 3.0 * y * z; v = dfadd2_f2_f2_f(dfmul_f2_f_f(z, z), y); v = dfmul_f2_f2_f(v, d); v = dfmul_f2_f2_f2(v, q2); z = ldexp2kf(v.x + v.y, (e + 6144) / 3 - 2048); if (xisinff(d)) { z = mulsignf(SLEEF_INFINITYf, q2.x); } if (d == 0) { z = mulsignf(0, q2.x); } return z; } // EXPORT CONST float xfabsf(float x) { return fabsfk(x); } EXPORT CONST float xcopysignf(float x, float y) { return copysignfk(x, y); } EXPORT CONST float xfmaxf(float x, float y) { return y != y ? x : (x > y ? x : y); } EXPORT CONST float xfminf(float x, float y) { return y != y ? x : (x < y ? 
x : y); } // tail of xfminf (definition begins on the previous source line)

// Positive difference: x - y clamped to 0. The explicit x == y test also
// yields 0 for equal infinities, where x - y would be NaN.
EXPORT CONST float xfdimf(float x, float y) {
  float ret = x - y;
  if (ret < 0 || x == y) ret = 0;
  return ret;
}

// Truncate toward zero. Values with |x| >= 2^23 (or inf) have no fractional
// part in binary32 and are returned unchanged.
EXPORT CONST float xtruncf(float x) {
  float fr = x - (int32_t)x;
  return (xisinff(x) || fabsfk(x) >= (float)(INT64_C(1) << 23)) ? x : copysignfk(x - fr, x);
}

// Round toward negative infinity.
EXPORT CONST float xfloorf(float x) {
  float fr = x - (int32_t)x;
  fr = fr < 0 ? fr+1.0f : fr;
  return (xisinff(x) || fabsfk(x) >= (float)(INT64_C(1) << 23)) ? x : copysignfk(x - fr, x);
}

// Round toward positive infinity.
EXPORT CONST float xceilf(float x) {
  float fr = x - (int32_t)x;
  fr = fr <= 0 ? fr : fr-1.0f;
  return (xisinff(x) || fabsfk(x) >= (float)(INT64_C(1) << 23)) ? x : copysignfk(x - fr, x);
}

// Round to nearest, ties away from zero (C round() semantics).
EXPORT CONST float xroundf(float d) {
  float x = d + 0.5f;
  float fr = x - (int32_t)x;
  if (fr == 0 && x <= 0) x--;
  fr = fr < 0 ? fr+1.0f : fr;
  // nextafterf(0.5, 0): the only value where d + 0.5f rounds up to exactly 1.
  x = d == 0.4999999701976776123f ? 0 : x;
  return (xisinff(d) || fabsfk(d) >= (float)(INT64_C(1) << 23)) ? d : copysignfk(x - fr, d);
}

// Round to nearest, ties to even (rint() semantics, independent of the FP
// rounding mode).
EXPORT CONST float xrintf(float d) {
  float x = d + 0.5f;
  int32_t isodd = (1 & (int32_t)x) != 0;
  float fr = x - (int32_t)x;
  fr = (fr < 0 || (fr == 0 && isodd)) ? fr+1.0f : fr;
  x = d == 0.50000005960464477539f ? 0 : x; // nextafterf(0.5, 1)
  return (xisinff(d) || fabsfk(d) >= (float)(INT64_C(1) << 23)) ? d : copysignfk(x - fr, d);
}

// modf(): returns { fractional part, integral part }, both carrying x's sign.
EXPORT CONST Sleef_float2 xmodff(float x) {
  float fr = x - (int32_t)x;
  fr = fabsfk(x) > (float)(INT64_C(1) << 23) ? 0 : fr;
  Sleef_float2 ret = { copysignfk(fr, x), copysignfk(x - fr, x) };
  return ret;
}

// ldexp(): x * 2^exp. The exponent is split into e1 + 4*e0 and applied as
// several partial powers of two (pow2if(e1) and p four times) so that no
// single intermediate factor over/underflows.
EXPORT CONST float xldexpf(float x, int exp) {
  if (exp > 300) exp = 300;
  if (exp < -300) exp = -300;
  int e0 = exp >> 2;
  if (exp < 0) e0++;
  if (-50 < exp && exp < 50) e0 = 0; // small exponents need no splitting
  int e1 = exp - (e0 << 2);
  float p = pow2if(e0);
  float ret = x * pow2if(e1) * p * p * p * p;
  return ret;
}

// nextafter(): step x one ULP toward y by integer manipulation of the bit
// pattern.
EXPORT CONST float xnextafterf(float x, float y) {
  union { float f; int32_t i; } cx;
  cx.f = x == 0 ? // ternary continues on the next source line
mulsignf(0, y) : x; // x == 0: start from a zero carrying y's sign so the step goes the right way
  int c = (cx.i < 0) == (y < x);
  if (c) cx.i = -(cx.i ^ (1 << 31)); // map negative floats onto a monotone integer range
  if (x != y) cx.i--;
  if (c) cx.i = -(cx.i ^ (1 << 31)); // map back
  if (cx.f == 0 && x != 0) cx.f = mulsignf(0, x);
  if (x == 0 && y == 0) cx.f = y;
  if (xisnanf(x) || xisnanf(y)) cx.f = SLEEF_NANf;
  return cx.f;
}

// frexp() fraction part: force the exponent field to that of 0.5 so the
// result lies in [0.5, 1); subnormal inputs are pre-scaled by 2^30 first.
EXPORT CONST float xfrfrexpf(float x) {
  union { float f; int32_t u; } cx;
  if (fabsfk(x) < FLT_MIN) x *= (1 << 30);
  cx.f = x;
  cx.u &= ~0x7f800000U; // clear the exponent bits
  cx.u |= 0x3f000000U;  // install the exponent of 0.5
  if (xisinff(x)) cx.f = mulsignf(SLEEF_INFINITYf, x);
  if (x == 0) cx.f = x;
  return cx.f;
}

// frexp() exponent part, read from the raw exponent field; subnormal inputs
// are pre-scaled by 2^30 with the bias adjusted by -30.
EXPORT CONST int xexpfrexpf(float x) {
  union { float f; uint32_t u; } cx;
  int ret = 0;
  if (fabsfk(x) < FLT_MIN) { x *= (1 << 30); ret = -30; }
  cx.f = x;
  ret += (int32_t)(((cx.u >> 23) & 0xff)) - 0x7e;
  if (x == 0 || xisnanf(x) || xisinff(x)) ret = 0;
  return ret;
}

// hypot, high-accuracy variant: max * sqrt(1 + (min/max)^2) in double-float,
// with tiny operands pre-scaled by 2^24 to avoid a subnormal division.
EXPORT CONST float xhypotf_u05(float x, float y) {
  x = fabsfk(x);
  y = fabsfk(y);
  float min = fminfk(x, y), n = min;
  float max = fmaxfk(x, y), d = max;
  if (max < FLT_MIN) { n *= UINT64_C(1) << 24; d *= UINT64_C(1) << 24; }
  Sleef_float2 t = dfdiv_f2_f2_f2(df(n, 0), df(d, 0));
  t = dfmul_f2_f2_f(dfsqrt_f2_f2(dfadd2_f2_f2_f(dfsqu_f2_f2(t), 1)), max);
  float ret = t.x + t.y;
  if (xisnanf(ret)) ret = SLEEF_INFINITYf;
  if (min == 0) ret = max;
  if (xisnanf(x) || xisnanf(y)) ret = SLEEF_NANf;
  if (x == SLEEF_INFINITYf || y == SLEEF_INFINITYf) ret = SLEEF_INFINITYf;
  return ret;
}

// hypot, fast variant: plain single-precision max * sqrt(1 + t^2).
EXPORT CONST float xhypotf_u35(float x, float y) {
  x = fabsfk(x);
  y = fabsfk(y);
  float min = fminfk(x, y);
  float max = fmaxfk(x, y);
  float t = min / max;
  float ret = max * SQRTF(1 + t*t);
  if (min == 0) ret = max;
  if (xisnanf(x) || xisnanf(y)) ret = SLEEF_NANf;
  if (x == SLEEF_INFINITYf || y == SLEEF_INFINITYf) ret = SLEEF_INFINITYf;
  return ret;
}

// One ULP toward zero (0 maps to 0); used by xfmodf so its quotient estimate
// never exceeds the true quotient.
static INLINE CONST float toward0f(float d) {
  return d == 0 ? 0 : intBitsToFloat(floatToRawIntBits(d)-1);
}

// Truncation helper for xfmodf; |x| >= 2^23 is already integral.
static INLINE CONST float ptruncf(float x) {
  return fabsfk(x) >= (float)(INT64_C(1) << 23) ? // ternary continues on the next source line
x : (x - (x - (int32_t)x)); } // tail of ptruncf (definition begins on the previous source line)

// fmod(): remainder of x/y carrying x's sign, computed by iterated
// conservative quotient subtraction in double-float arithmetic.
EXPORT CONST float xfmodf(float x, float y) {
  float nu = fabsfk(x), de = fabsfk(y), s = 1, q;
  if (de < FLT_MIN) {
    // Scale subnormal divisors up by 2^25; undo via s at the end.
    nu *= UINT64_C(1) << 25; de *= UINT64_C(1) << 25; s = 1.0f / (UINT64_C(1) << 25);
  }
  Sleef_float2 r = df(nu, 0);
  float rde = toward0f(1.0f / de); // reciprocal nudged toward zero: q never overshoots
  for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1 iterations suffice
    q = ptruncf(toward0f(r.x) * rde);
    q = (3*de > r.x && r.x >= de) ? 2 : q;
    q = (2*de > r.x && r.x >= de) ? 1 : q;
    r = dfnormalize_f2_f2(dfadd2_f2_f2_f2(r, dfmul_f2_f_f(q, -de)));
    if (r.x < de) break;
  }
  float ret = (r.x + r.y) * s;
  if (r.x + r.y == de) ret = 0;
  ret = mulsignf(ret, x);
  if (nu < de) ret = x; // |x| < |y|: the result is x itself
  if (de == 0) ret = SLEEF_NANf;
  return ret;
}

// Round-to-nearest-even helper for xremainderf (like xrintf but without the
// infinity check or the near-0.5 special case).
static INLINE CONST float rintfk2(float d) {
  float x = d + 0.5f;
  int32_t isodd = (1 & (int32_t)x) != 0;
  float fr = x - (int32_t)x;
  fr = (fr < 0 || (fr == 0 && isodd)) ? fr+1.0f : fr;
  return (fabsfk(d) >= (float)(INT64_C(1) << 23)) ? d : copysignfk(x - fr, d);
}

// remainder(): IEEE remainder — quotient rounded to nearest, ties broken to
// even via the qisodd flag.
EXPORT CONST float xremainderf(float x, float y) {
  float n = fabsfk(x), d = fabsfk(y), s = 1, q;
  if (d < FLT_MIN*2) {
    // Scale near-subnormal divisors up by 2^25; undo via s at the end.
    n *= UINT64_C(1) << 25; d *= UINT64_C(1) << 25; s = 1.0f / (UINT64_C(1) << 25);
  }
  float rd = 1.0f / d;
  Sleef_float2 r = df(n, 0);
  int qisodd = 0;
  for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1
    q = rintfk2(r.x * rd);
    if (fabsfk(r.x) < 1.5f * d) q = r.x < 0 ? -1 : 1;
    if (fabsfk(r.x) < 0.5f * d || (fabsfk(r.x) == 0.5f * d && !qisodd)) q = 0;
    if (q == 0) break;
    if (xisinff(q * -d)) q = q + mulsignf(-1, r.x); // back off if q*d would overflow
    qisodd ^= (1 & (int)q) != 0 && fabsfk(q) < (float)(INT64_C(1) << 24);
    r = dfnormalize_f2_f2(dfadd2_f2_f2_f2(r, dfmul_f2_f_f(q, -d)));
  }
  float ret = r.x * s;
  ret = mulsignf(ret, x);
  if (xisinff(y)) ret = xisinff(x) ? SLEEF_NANf : x;
  if (d == 0) ret = SLEEF_NANf;
  return ret;
}

// Square root, high-accuracy variant: Newton iterations on a fast inverse
// square root estimate, then one double-float correction step.
EXPORT CONST float xsqrtf_u05(float d) {
  float q = 0.5f;
  d = d < 0 ? // ternary continues on the next source line
SLEEF_NANf : d; if (d < 5.2939559203393770e-23f) { d *= 1.8889465931478580e+22f; q = 7.2759576141834260e-12f * 0.5f; } if (d > 1.8446744073709552e+19f) { d *= 5.4210108624275220e-20f; q = 4294967296.0f * 0.5f; } // http://en.wikipedia.org/wiki/Fast_inverse_square_root float x = intBitsToFloat(0x5f375a86 - (floatToRawIntBits(d + 1e-45f) >> 1)); x = x * (1.5f - 0.5f * d * x * x); x = x * (1.5f - 0.5f * d * x * x); x = x * (1.5f - 0.5f * d * x * x) * d; Sleef_float2 d2 = dfmul_f2_f2_f2(dfadd2_f2_f_f2(d, dfmul_f2_f_f(x, x)), dfrec_f2_f(x)); float ret = (d2.x + d2.y) * q; ret = d == SLEEF_INFINITYf ? SLEEF_INFINITYf : ret; ret = d == 0 ? d : ret; return ret; } EXPORT CONST float xsqrtf_u35(float d) { float q = 1.0f; d = d < 0 ? SLEEF_NANf : d; if (d < 5.2939559203393770e-23f) { d *= 1.8889465931478580e+22f; q = 7.2759576141834260e-12f; } if (d > 1.8446744073709552e+19f) { d *= 5.4210108624275220e-20f; q = 4294967296.0f; } // http://en.wikipedia.org/wiki/Fast_inverse_square_root float x = intBitsToFloat(0x5f375a86 - (floatToRawIntBits(d + 1e-45) >> 1)); x = x * (1.5f - 0.5f * d * x * x); x = x * (1.5f - 0.5f * d * x * x); x = x * (1.5f - 0.5f * d * x * x); x = x * (1.5f - 0.5f * d * x * x); return d == SLEEF_INFINITYf ? SLEEF_INFINITYf : (x * d * q); } EXPORT CONST float xsqrtf(float d) { return SQRTF(d); } EXPORT CONST float xfmaf(float x, float y, float z) { float h2 = x * y + z, q = 1; if (fabsfk(h2) < 1e-38f) { const float c0 = 1 << 25, c1 = c0 * c0, c2 = c1 * c1; x *= c1; y *= c1; z *= c2; q = 1.0f / c2; } if (fabsfk(h2) > 1e+38f) { const float c0 = 1 << 25, c1 = c0 * c0, c2 = c1 * c1; x *= 1.0 / c1; y *= 1.0 / c1; z *= 1.0 / c2; q = c2; } Sleef_float2 d = dfmul_f2_f_f(x, y); d = dfadd2_f2_f2_f(d, z); float ret = (x == 0 || y == 0) ? z : (d.x + d.y); if (xisinff(z) && !xisinff(x) && !xisnanf(x) && !xisinff(y) && !xisnanf(y)) h2 = z; return (xisinff(h2) || xisnanf(h2)) ? 
/* NOTE(review): tail of SLEEF's sleefsp.c (single-precision kernels):
   sinpifk/xsinpif_u05, cospifk/xcospif_u05, the double-float gamma kernel
   gammafk with its xtgammaf_u1/xlgammaf_u1 wrappers, xerff_u1, xerfcf_u15,
   and an ENABLE_MAIN ad-hoc test driver.  The extraction collapsed every
   newline into spaces, so the upstream lone "//" separator comments now
   comment out the remainder of each (huge) physical line, and the
   "#include" in the ENABLE_MAIN section lost its header name (presumably
   <stdio.h>/<stdlib.h> — confirm against upstream SLEEF).  Do NOT compile
   this as-is; restore the original line structure from upstream before any
   behavioral change.  Code below is reproduced byte-identically. */
h2 : ret*q; } // static INLINE CONST Sleef_float2 sinpifk(float d) { float u, s, t; Sleef_float2 x, s2; u = d * 4; int q = ceilfk(u) & ~1; int o = (q & 2) != 0; s = u - (float)q; t = s; s = s * s; s2 = dfmul_f2_f_f(t, t); // u = o ? -0.2430611801e-7f : +0.3093842054e-6f; u = mlaf(u, s, o ? +0.3590577080e-5f : -0.3657307388e-4f); u = mlaf(u, s, o ? -0.3259917721e-3f : +0.2490393585e-2f); x = dfadd2_f2_f_f2(u * s, o ? df(0.015854343771934509277, 4.4940051354032242811e-10) : df(-0.080745510756969451904, -1.3373665339076936258e-09)); x = dfadd2_f2_f2_f2(dfmul_f2_f2_f2(s2, x), o ? df(-0.30842512845993041992, -9.0728339030733922277e-09) : df(0.78539818525314331055, -2.1857338617566484855e-08)); x = dfmul_f2_f2_f2(x, o ? s2 : df(t, 0)); x = o ? dfadd2_f2_f2_f(x, 1) : x; // if ((q & 4) != 0) { x.x = -x.x; x.y = -x.y; } return x; } EXPORT CONST float xsinpif_u05(float d) { Sleef_float2 x = sinpifk(d); float r = x.x + x.y; if (xisnegzerof(d)) r = -0.0; if (fabsfk(d) > TRIGRANGEMAX4f) r = 0; if (xisinff(d)) r = SLEEF_NANf; return r; } static INLINE CONST Sleef_float2 cospifk(float d) { float u, s, t; Sleef_float2 x, s2; u = d * 4; int q = ceilfk(u) & ~1; int o = (q & 2) == 0; s = u - (float)q; t = s; s = s * s; s2 = dfmul_f2_f_f(t, t); // u = o ? -0.2430611801e-7f : +0.3093842054e-6f; u = mlaf(u, s, o ? +0.3590577080e-5f : -0.3657307388e-4f); u = mlaf(u, s, o ? -0.3259917721e-3f : +0.2490393585e-2f); x = dfadd2_f2_f_f2(u * s, o ? df(0.015854343771934509277, 4.4940051354032242811e-10) : df(-0.080745510756969451904, -1.3373665339076936258e-09)); x = dfadd2_f2_f2_f2(dfmul_f2_f2_f2(s2, x), o ? df(-0.30842512845993041992, -9.0728339030733922277e-09) : df(0.78539818525314331055, -2.1857338617566484855e-08)); x = dfmul_f2_f2_f2(x, o ? s2 : df(t, 0)); x = o ? 
dfadd2_f2_f2_f(x, 1) : x; // if (((q+2) & 4) != 0) { x.x = -x.x; x.y = -x.y; } return x; } EXPORT CONST float xcospif_u05(float d) { Sleef_float2 x = cospifk(d); float r = x.x + x.y; if (fabsfk(d) > TRIGRANGEMAX4f) r = 1; if (xisinff(d)) r = SLEEF_NANf; return r; } typedef struct { Sleef_float2 a, b; } df2; static CONST df2 gammafk(float a) { Sleef_float2 clc = df(0, 0), clln = df(1, 0), clld = df(1, 0), v = df(1, 0), x, y, z; float t, u; int otiny = fabsfk(a) < 1e-30f, oref = a < 0.5f; x = otiny ? df(0, 0) : (oref ? dfadd2_f2_f_f(1, -a) : df(a, 0)); int o0 = (0.5f <= x.x && x.x <= 1.2), o2 = 2.3 < x.x; y = dfnormalize_f2_f2(dfmul_f2_f2_f2(dfadd2_f2_f2_f(x, 1), x)); y = dfnormalize_f2_f2(dfmul_f2_f2_f2(dfadd2_f2_f2_f(x, 2), y)); clln = (o2 && x.x <= 7) ? y : clln; x = (o2 && x.x <= 7) ? dfadd2_f2_f2_f(x, 3) : x; t = o2 ? (1.0 / x.x) : dfnormalize_f2_f2(dfadd2_f2_f2_f(x, o0 ? -1 : -2)).x; u = o2 ? +0.000839498720672087279971000786 : (o0 ? +0.9435157776e+0f : +0.1102489550e-3f); u = mlaf(u, t, o2 ? -5.17179090826059219329394422e-05 : (o0 ? +0.8670063615e+0f : +0.8160019934e-4f)); u = mlaf(u, t, o2 ? -0.000592166437353693882857342347 : (o0 ? +0.4826702476e+0f : +0.1528468856e-3f)); u = mlaf(u, t, o2 ? +6.97281375836585777403743539e-05 : (o0 ? -0.8855129778e-1f : -0.2355068718e-3f)); u = mlaf(u, t, o2 ? +0.000784039221720066627493314301 : (o0 ? +0.1013825238e+0f : +0.4962242092e-3f)); u = mlaf(u, t, o2 ? -0.000229472093621399176949318732 : (o0 ? -0.1493408978e+0f : -0.1193488017e-2f)); u = mlaf(u, t, o2 ? -0.002681327160493827160473958490 : (o0 ? +0.1697509140e+0f : +0.2891599433e-2f)); u = mlaf(u, t, o2 ? +0.003472222222222222222175164840 : (o0 ? -0.2072454542e+0f : -0.7385451812e-2f)); u = mlaf(u, t, o2 ? +0.083333333333333333335592087900 : (o0 ? 
+0.2705872357e+0f : +0.2058077045e-1f)); y = dfmul_f2_f2_f2(dfadd2_f2_f2_f(x, -0.5), logk2f(x)); y = dfadd2_f2_f2_f2(y, dfneg_f2_f2(x)); y = dfadd2_f2_f2_f2(y, dfx(0.91893853320467278056)); // 0.5*log(2*M_PI) z = dfadd2_f2_f2_f(dfmul_f2_f_f (u, t), o0 ? -0.400686534596170958447352690395e+0f : -0.673523028297382446749257758235e-1f); z = dfadd2_f2_f2_f(dfmul_f2_f2_f(z, t), o0 ? +0.822466960142643054450325495997e+0f : +0.322467033928981157743538726901e+0f); z = dfadd2_f2_f2_f(dfmul_f2_f2_f(z, t), o0 ? -0.577215665946766039837398973297e+0f : +0.422784335087484338986941629852e+0f); z = dfmul_f2_f2_f(z, t); clc = o2 ? y : z; clld = o2 ? dfadd2_f2_f2_f(dfmul_f2_f_f(u, t), 1) : clld; y = clln; clc = otiny ? dfx(41.58883083359671856503) : // log(2^60) (oref ? dfadd2_f2_f2_f2(dfx(1.1447298858494001639), dfneg_f2_f2(clc)) : clc); // log(M_PI) clln = otiny ? df(1, 0) : (oref ? clln : clld); if (oref) x = dfmul_f2_f2_f2(clld, sinpifk(a - (float)(INT64_C(1) << 12) * (int32_t)(a * (1.0 / (INT64_C(1) << 12))))); clld = otiny ? df(a*((INT64_C(1) << 30)*(float)(INT64_C(1) << 30)), 0) : (oref ? x : y); df2 ret = { clc, dfdiv_f2_f2_f2(clln, clld) }; return ret; } EXPORT CONST float xtgammaf_u1(float a) { df2 d = gammafk(a); Sleef_float2 y = dfmul_f2_f2_f2(expk2f(d.a), d.b); float r = y.x + y.y; r = (a == -SLEEF_INFINITYf || (a < 0 && xisintf(a)) || (xisnumberf(a) && a < 0 && xisnanf(r))) ? SLEEF_NANf : r; r = ((a == SLEEF_INFINITYf || xisnumberf(a)) && a >= -FLT_MIN && (a == 0 || a > 36 || xisnanf(r))) ? mulsignf(SLEEF_INFINITYf, a) : r; return r; } EXPORT CONST float xlgammaf_u1(float a) { df2 d = gammafk(a); Sleef_float2 y = dfadd2_f2_f2_f2(d.a, logk2f(dfabs_f2_f2(d.b))); float r = y.x + y.y; r = (xisinff(a) || (a <= 0 && xisintf(a)) || (xisnumberf(a) && xisnanf(r))) ? SLEEF_INFINITYf : r; return r; } EXPORT CONST float xerff_u1(float a) { float s = a, t, u; Sleef_float2 d; a = fabsfk(a); int o0 = a < 1.1f, o1 = a < 2.4f, o2 = a < 4.0f; u = o0 ? (a*a) : a; t = o0 ? 
+0.7089292194e-4f : o1 ? -0.1792667899e-4f : -0.9495757695e-5f; t = mlaf(t, u, o0 ? -0.7768311189e-3f : o1 ? +0.3937633010e-3f : +0.2481465926e-3f); t = mlaf(t, u, o0 ? +0.5159463733e-2f : o1 ? -0.3949181177e-2f : -0.2918176819e-2f); t = mlaf(t, u, o0 ? -0.2683781274e-1f : o1 ? +0.2445474640e-1f : +0.2059706673e-1f); t = mlaf(t, u, o0 ? +0.1128318012e+0f : o1 ? -0.1070996150e+0f : -0.9901899844e-1f); d = dfmul_f2_f_f(t, u); d = dfadd2_f2_f2_f2(d, o0 ? dfx(-0.376125876000657465175213237214e+0) : o1 ? dfx(-0.634588905908410389971210809210e+0) : dfx(-0.643598050547891613081201721633e+0)); d = dfmul_f2_f2_f(d, u); d = dfadd2_f2_f2_f2(d, o0 ? dfx(+0.112837916021059138255978217023e+1) : o1 ? dfx(-0.112879855826694507209862753992e+1) : dfx(-0.112461487742845562801052956293e+1)); d = dfmul_f2_f2_f(d, a); d = o0 ? d : dfadd_f2_f_f2(1.0, dfneg_f2_f2(expk2f(d))); u = mulsignf(o2 ? (d.x + d.y) : 1, s); u = xisnanf(a) ? SLEEF_NANf : u; return u; } EXPORT CONST float xerfcf_u15(float a) { float s = a, r = 0, t; Sleef_float2 u, d, x; a = fabsfk(a); int o0 = a < 1.0f, o1 = a < 2.2f, o2 = a < 4.3f, o3 = a < 10.1f; u = o1 ? df(a, 0) : dfdiv_f2_f2_f2(df(1, 0), df(a, 0)); t = o0 ? -0.8638041618e-4f : o1 ? -0.6236977242e-5f : o2 ? -0.3869504035e+0f : +0.1115344167e+1f; t = mlaf(t, u.x, o0 ? +0.6000166177e-3f : o1 ? +0.5749821503e-4f : o2 ? +0.1288077235e+1f : -0.9454904199e+0f); t = mlaf(t, u.x, o0 ? -0.1665703603e-2f : o1 ? +0.6002851478e-5f : o2 ? -0.1816803217e+1f : -0.3667259514e+0f); t = mlaf(t, u.x, o0 ? +0.1795156277e-3f : o1 ? -0.2851036377e-2f : o2 ? +0.1249150872e+1f : +0.7155663371e+0f); t = mlaf(t, u.x, o0 ? +0.1914106123e-1f : o1 ? +0.2260518074e-1f : o2 ? -0.1328857988e+0f : -0.1262947265e-1f); d = dfmul_f2_f2_f(u, t); d = dfadd2_f2_f2_f2(d, o0 ? dfx(-0.102775359343930288081655368891e+0) : o1 ? dfx(-0.105247583459338632253369014063e+0) : o2 ? 
dfx(-0.482365310333045318680618892669e+0) : dfx(-0.498961546254537647970305302739e+0)); d = dfmul_f2_f2_f2(d, u); d = dfadd2_f2_f2_f2(d, o0 ? dfx(-0.636619483208481931303752546439e+0) : o1 ? dfx(-0.635609463574589034216723775292e+0) : o2 ? dfx(-0.134450203224533979217859332703e-2) : dfx(-0.471199543422848492080722832666e-4)); d = dfmul_f2_f2_f2(d, u); d = dfadd2_f2_f2_f2(d, o0 ? dfx(-0.112837917790537404939545770596e+1) : o1 ? dfx(-0.112855987376668622084547028949e+1) : o2 ? dfx(-0.572319781150472949561786101080e+0) : dfx(-0.572364030327966044425932623525e+0)); x = dfmul_f2_f2_f(o1 ? d : df(-a, 0), a); x = o1 ? x : dfadd2_f2_f2_f2(x, d); x = expk2f(x); x = o1 ? x : dfmul_f2_f2_f2(x, u); r = o3 ? (x.x + x.y) : 0; if (s < 0) r = 2 - r; r = xisnanf(s) ? SLEEF_NANf : r; return r; } // #ifdef ENABLE_MAIN // gcc -w -DENABLE_MAIN -I../common sleefsp.c rempitab.c -lm #include int main(int argc, char **argv) { float d1 = atof(argv[1]); //float d2 = atof(argv[2]); //float d3 = atof(argv[3]); //printf("%.20g, %.20g\n", (double)d1, (double)d2); //float i2 = atoi(argv[2]); //float c = xatan2f_u1(d1, d2); //printf("round %.20g\n", (double)d1); printf("test = %.20g\n", (double)xsqrtf_u05(d1)); //printf("correct = %.20g\n", (double)roundf(d1)); //printf("rint %.20g\n", (double)d1); //printf("test = %.20g\n", (double)xrintf(d1)); //printf("correct = %.20g\n", (double)rintf(d1)); //Sleef_float2 r = xsincospif_u35(d); //printf("%g, %g\n", (double)r.x, (double)r.y); } #endif ================================================ FILE: src/ufp.cpp ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include // ---------------------------------------------------------------------------- // Actual implementation namespace nsimd { template int ufp(T a_, T b_) { UnsignedType a = nsimd::scalar_reinterpret(UnsignedType(), a_); UnsignedType b = nsimd::scalar_reinterpret(UnsignedType(), b_); UnsignedType exp_mask = ((UnsignedType)1 << ExponentSize) - 1; i64 ea = (i64)((a >> MantissaSize) & exp_mask); i64 eb = (i64)((b >> MantissaSize) & exp_mask); if (ea - eb > 1 || ea - eb < -1) { return 0; } UnsignedType man_mask = ((UnsignedType)1 << MantissaSize) - 1; i64 ma = (i64)(a & man_mask) | ((i64)1 << MantissaSize); i64 mb = (i64)(b & man_mask) | ((i64)1 << MantissaSize); i64 d = 0; if (ea == eb) { d = ma - mb; } else if (ea > eb) { d = 2 * ma - mb; } else { d = 2 * mb - ma; } d = (d >= 0 ? 
d : -d); int i = 0; for (; i <= MantissaSize + 1 && d >= ((i64)1 << i); i++) ; return (int)(MantissaSize + 1 - i); } } // namespace nsimd // ---------------------------------------------------------------------------- // C ABI extern "C" { NSIMD_DLLSPEC int nsimd_ufp_f16(f16 a, f16 b) { return nsimd::ufp<5, 10, u16>(a, b); } NSIMD_DLLSPEC int nsimd_ufp_f32(f32 a, f32 b) { return nsimd::ufp<8, 23, u32>(a, b); } NSIMD_DLLSPEC int nsimd_ufp_f64(f64 a, f64 b) { return nsimd::ufp<11, 52, u64>(a, b); } } // extern "C" ================================================ FILE: tests/CMakeLists.txt.sh ================================================ # MIT License # # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. set -e set -x BUF="`dirname $0`/.." 
# NOTE(review): remainder of tests/CMakeLists.txt.sh (cross-compiler selection
# per SIMD extension + a cmake/make/install smoke test) followed by the whole
# of tests/FindNSIMD.cmake.sh (four FindNSIMD.cmake scenarios per extension:
# specific-found, auto-found, specific-notfound, auto-notfound).  The
# extraction collapsed all newlines into spaces: shell statement boundaries,
# case-arm layout and, critically, the heredoc bodies (`cat > ... <<-EOF`)
# whose EOF terminators must sit on their own lines, are gone.  These scripts
# are NOT runnable in this flattened form; restore line structure from the
# upstream repository before editing.  Content below is reproduced verbatim.
NSIMD_CMAKE=`realpath ${BUF}` for simd_ext in "$@"; do # Take care of cross compilation here case ${simd_ext} in aarch64 | sve | sve128 | sve256 | sve512 | sve1024 | sve2048) C_COMP="aarch64-linux-gnu-gcc" CXX_COMP="aarch64-linux-gnu-g++" ;; neon128) C_COMP="arm-linux-gnueabi-gcc" CXX_COMP="arm-linux-gnueabi-g++" ;; vmx | vsx) C_COMP="${NSIMD_CMAKE}/scripts/powerpc64le-linux-gnu-clang.sh" CXX_COMP="${NSIMD_CMAKE}/scripts/powerpc64le-linux-gnu-clang++.sh" ;; oneapi) C_COMP="gcc" CXX_COMP="dpcpp" ;; rocm) C_COMP="gcc" CXX_COMP="${NSIMD_CMAKE}/scripts/hipcc.sh" ;; cuda) C_COMP="gcc" CXX_COMP="nvcc" ;; *) C_COMP="gcc" CXX_COMP="g++" ;; esac # First case: find a specific component ROOT_DIR="${PWD}/nsimd_cmake_tests/${simd_ext}" rm -rf ${ROOT_DIR} mkdir -p ${ROOT_DIR} (cd ${ROOT_DIR} && \ cmake ${NSIMD_CMAKE} \ -Dsimd=${simd_ext} \ -DCMAKE_INSTALL_PREFIX=${ROOT_DIR}/root \ -DCMAKE_C_COMPILER="${C_COMP}" \ -DCMAKE_CXX_COMPILER="${CXX_COMP}" && \ make VERBOSE=1 && \ make install) done ================================================ FILE: tests/FindNSIMD.cmake.sh ================================================ #!/bin/bash # # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. set -e set -x FIND_NSIMD_CMAKE="`dirname $0`/../scripts/FindNSIMD.cmake" SIMD_EXTS="sse2 sse42 avx avx2 avx512_knl avx512_skylake neon128 aarch64 \ sve sve128 sve256 sve512 sve1024 sve2048 cuda rocm" for simd_ext in ${SIMD_EXTS}; do # First case: find a specific component ROOT_DIR="${PWD}/find_nsimd_cmake_tests/${simd_ext}" rm -rf ${ROOT_DIR} mkdir -p "${ROOT_DIR}/cmake" cp "${FIND_NSIMD_CMAKE}" "${ROOT_DIR}/cmake" mkdir -p "${ROOT_DIR}/root/include/nsimd" touch "${ROOT_DIR}/root/include/nsimd/nsimd.h" mkdir -p "${ROOT_DIR}/root/lib" touch "${ROOT_DIR}/root/lib/libnsimd_${simd_ext}.so" cat >"${ROOT_DIR}/CMakeLists.txt" <<-EOF cmake_minimum_required(VERSION 3.0.0) project(FIND_NSIMD_CMAKE_TESTS) set(CMAKE_MODULE_PATH "${ROOT_DIR}/cmake") set(CMAKE_PREFIX_PATH "${ROOT_DIR}/root") find_package(NSIMD COMPONENTS ${simd_ext}) message(STATUS "FindNSIMD.cmake test : specific for ${simd_ext}") message(STATUS "NSIMD_FOUND = \${NSIMD_FOUND}") if (\${NSIMD_FOUND}) message(STATUS "NSIMD_INCLUDE_DIRS = \${NSIMD_INCLUDE_DIRS}") message(STATUS "NSIMD_LIBRARY_DIRS = \${NSIMD_LIBRARY_DIRS}") message(STATUS "NSIMD_LIBRARIES = \${NSIMD_LIBRARIES}") else() message(FATAL_ERROR "error NSIMD_FOUND should be TRUE") endif() EOF (cd "${ROOT_DIR}" && mkdir -p build && cd build && cmake ..) 
# Second case: find a automatically a component ROOT_DIR="${PWD}/find_nsimd_cmake_tests/${simd_ext}-auto" rm -rf ${ROOT_DIR} mkdir -p "${ROOT_DIR}/cmake" cp "${FIND_NSIMD_CMAKE}" "${ROOT_DIR}/cmake" mkdir -p "${ROOT_DIR}/root/include/nsimd" touch "${ROOT_DIR}/root/include/nsimd/nsimd.h" mkdir -p "${ROOT_DIR}/root/lib" touch "${ROOT_DIR}/root/lib/libnsimd_${simd_ext}.so" cat >"${ROOT_DIR}/CMakeLists.txt" <<-EOF cmake_minimum_required(VERSION 3.0.0) project(FIND_NSIMD_CMAKE_TESTS) set(CMAKE_MODULE_PATH "${ROOT_DIR}/cmake") set(CMAKE_PREFIX_PATH "${ROOT_DIR}/root") find_package(NSIMD) message(STATUS "FindNSIMD.cmake test : automatic for ${simd_ext}") message(STATUS "NSIMD_FOUND = \${NSIMD_FOUND}") if (\${NSIMD_FOUND}) message(STATUS "NSIMD_INCLUDE_DIRS = \${NSIMD_INCLUDE_DIRS}") message(STATUS "NSIMD_LIBRARY_DIRS = \${NSIMD_LIBRARY_DIRS}") message(STATUS "NSIMD_LIBRARIES = \${NSIMD_LIBRARIES}") else() message(FATAL_ERROR "error NSIMD_FOUND should be TRUE") endif() EOF (cd "${ROOT_DIR}" && mkdir -p build && cd build && cmake ..) 
# Third case: find a specific component ROOT_DIR="${PWD}/find_nsimd_cmake_tests/${simd_ext}-notfound" rm -rf ${ROOT_DIR} mkdir -p "${ROOT_DIR}/cmake" cp "${FIND_NSIMD_CMAKE}" "${ROOT_DIR}/cmake" mkdir -p "${ROOT_DIR}/root/include/nsimd" touch "${ROOT_DIR}/root/include/nsimd/nsimd.h" mkdir -p "${ROOT_DIR}/root/lib" touch "${ROOT_DIR}/root/lib/libnsimd_cpu.so" cat >"${ROOT_DIR}/CMakeLists.txt" <<-EOF cmake_minimum_required(VERSION 3.0.0) project(FIND_NSIMD_CMAKE_TESTS) set(CMAKE_MODULE_PATH "${ROOT_DIR}/cmake") set(CMAKE_PREFIX_PATH "${ROOT_DIR}/root") find_package(NSIMD COMPONENTS ${simd_ext}) message(STATUS "FindNSIMD.cmake test : " "notfound specific for ${simd_ext}") message(STATUS "NSIMD_FOUND = \${NSIMD_FOUND}") if (\${NSIMD_FOUND}) message(STATUS "NSIMD_INCLUDE_DIRS = \${NSIMD_INCLUDE_DIRS}") message(STATUS "NSIMD_LIBRARY_DIRS = \${NSIMD_LIBRARY_DIRS}") message(STATUS "NSIMD_LIBRARIES = \${NSIMD_LIBRARIES}") message(FATAL_ERROR "error NSIMD_FOUND should be FALSE") else() message(STATUS "NSIMD not found") endif() EOF (cd "${ROOT_DIR}" && mkdir -p build && cd build && cmake ..) 
# Fourth case: find a automatically a component ROOT_DIR="${PWD}/find_nsimd_cmake_tests/${simd_ext}-auto-notfound" rm -rf ${ROOT_DIR} mkdir -p "${ROOT_DIR}/cmake" cp "${FIND_NSIMD_CMAKE}" "${ROOT_DIR}/cmake" mkdir -p "${ROOT_DIR}/root/include/nsimd" touch "${ROOT_DIR}/root/include/nsimd/nsimd.h" mkdir -p "${ROOT_DIR}/root/lib" cat >"${ROOT_DIR}/CMakeLists.txt" <<-EOF cmake_minimum_required(VERSION 3.0.0) project(FIND_NSIMD_CMAKE_TESTS) set(CMAKE_MODULE_PATH "${ROOT_DIR}/cmake") set(CMAKE_PREFIX_PATH "${ROOT_DIR}/root") find_package(NSIMD) message(STATUS "FindNSIMD.cmake test : " "notfound automatic for ${simd_ext}") message(STATUS "NSIMD_FOUND = \${NSIMD_FOUND}") if (\${NSIMD_FOUND}) message(STATUS "NSIMD_INCLUDE_DIRS = \${NSIMD_INCLUDE_DIRS}") message(STATUS "NSIMD_LIBRARY_DIRS = \${NSIMD_LIBRARY_DIRS}") message(STATUS "NSIMD_LIBRARIES = \${NSIMD_LIBRARIES}") message(FATAL_ERROR "error NSIMD_FOUND should be FALSE") else() message(STATUS "NSIMD not found") endif() EOF (cd "${ROOT_DIR}" && mkdir -p build && cd build && cmake ..) done ================================================ FILE: tests/allocator.cpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include int main() { std::vector > v; v.clear(); v.resize(100); v.clear(); v.resize(100); v.resize(10000); v.clear(); v.reserve(30); for (int i = 0; i < 1000; i++) { v.push_back(float(i)); } if (v.size() != 1000) { exit(EXIT_FAILURE); } for (int i = 0; i < 500; i++) { v.pop_back(); } if (v.size() != 500) { exit(EXIT_FAILURE); } return 0; } ================================================ FILE: tests/assign_arith.cpp ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
NOTE(review): tests/assign_arith.cpp — macro-generated tests (HELPER) that
check compound-assignment operators (+=, -=, *=, /=, |=, &=, ^=, <<=, >>=)
against their binary counterparts on nsimd packs.  The extraction stripped
all '<...>' spans: every "template T ..." has lost its parameter list,
"typedef pack pack;" has lost its element-type argument, and each
"test_##name##_T(n)" call in the dispatcher functions has lost the
per-type template argument that distinguished the eleven (resp. eight)
calls — TODO restore from upstream; not compilable as-is.  Code reproduced
verbatim below.
*/ #include #include /* ------------------------------------------------------------------------- */ /* Random number */ template T get_rand() { return (T)((rand() % 10) + 1); } template <> f16 get_rand() { return nsimd_f32_to_f16(get_rand()); } /* ------------------------------------------------------------------------- */ /* Arithmetic operators */ #define HELPER(op1, op2, name) \ template int test_##name##_T(size_t n) { \ std::vector a(n), b(n); \ for (size_t i = 0; i < n; i++) { \ a[i] = get_rand(); \ b[i] = get_rand(); \ } \ \ using namespace nsimd; \ typedef pack pack; \ for (size_t i = 0; i < n; i += size_t(len(pack()))) { \ pack tmp1 = loadu(&a[i]); \ tmp1 op1 loadu(&b[i]); \ pack tmp2 = loadu(&a[i]) op2 loadu(&b[i]); \ if (any(tmp1 != tmp2)) { \ return -1; \ } \ } \ return 0; \ } \ \ int test_##name(size_t n) { \ return test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n); \ } \ \ int test_##name##_int_only(size_t n) { \ return test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n); \ } HELPER(+=, +, add) HELPER(-=, -, sub) HELPER(*=, *, mul) HELPER(/=, /, div) HELPER(|=, |, orb) HELPER(&=, &, andb) HELPER(^=, ^, xorb) #undef HELPER /* ------------------------------------------------------------------------- */ /* Shift operators */ #define HELPER(op1, op2, name) \ template int test_##name##_T(size_t n) { \ std::vector a(n); \ for (size_t i = 0; i < n; i++) { \ a[i] = get_rand(); \ } \ \ using namespace nsimd; \ typedef pack pack; \ for (int s = 0; s <= 3; s++) { \ for (size_t i = 0; i < n; i += size_t(len(pack()))) { \ pack tmp = loadu(&a[i]); \ tmp op1 s; \ if (any(tmp != (loadu(&a[i]) op2 s))) { \ return -1; \ } \ } \ } \ return 0; \ } \ \ 
int test_##name(size_t n) { \ return test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n); \ } HELPER(<<=, <<, shl) HELPER(>>=, >>, shr) #undef HELPER /* ------------------------------------------------------------------------- */ int main() { const size_t n = 2048; return test_add(n) || test_sub(n) || test_mul(n) || test_div(n) || test_orb_int_only(n) || test_andb_int_only(n) || test_xorb_int_only(n) || test_shl(n) || test_shr(n); } ================================================ FILE: tests/booleans.cpp ================================================ /* Copyright (c) 2020 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #include // ---------------------------------------------------------------------------- int main() { using namespace nsimd; packl v = packl(true) || packl(false); if (!all(v)) { return -1; } return 0; } ================================================ FILE: tests/c11_vec.c ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include int main() { #if NSIMD_C >= 2011 float in[NSIMD_MAX_LEN(f32)]; int out[NSIMD_MAX_LEN(i32)]; nsimd_pack(f32) vin = nsimd_load(unaligned, nsimd_pack(f32), in); nsimd_pack(i32) vout = nsimd_reinterpret(nsimd_pack(i32), vin); nsimd_store(unaligned, out, vout); #endif return 0; } ================================================ FILE: tests/cxx_adv_api_aliases.cpp ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include /* ------------------------------------------------------------------------- */ /* Random number */ template T get_rand() { return (T)((rand() % 100) - 50); } template <> f16 get_rand() { return nsimd_f32_to_f16(get_rand()); } /* ------------------------------------------------------------------------- */ template int test_aliases(size_t n) { std::vector a(n), b(n); for (size_t i = 0; i < n; i++) { a[i] = get_rand(); b[i] = get_rand(); } using namespace nsimd; typedef pack pack; size_t step = size_t(len(pack())); for (size_t i = 0; i + step <= n; i += step) { pack tmp1 = loadu(&a[i]); pack tmp2 = loadu(&b[i]); if (any(fabs(tmp1) != abs(tmp1))) { return -1; } if (any(fmin(tmp1, tmp2) != min(tmp1, tmp2))) { return -1; } if (any(fmax(tmp1, tmp2) != max(tmp1, tmp2))) { return -1; } } return 0; } /* ------------------------------------------------------------------------- */ int main() { return test_aliases(2048) || test_aliases(2048) || test_aliases(2048) || test_aliases(2048) || test_aliases(2048) || test_aliases(2048) || test_aliases(2048) || test_aliases(2048) || test_aliases(2048) || test_aliases(2048) || test_aliases(2048); } ================================================ FILE: tests/fp16.prec11.c ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
NOTE(review): tests/fp16.prec11.c — precision tests for nsimd's f16<->f32
conversions: via_fp16 round-trips through f16, mk_fp32 builds a float from
mantissa/exponent via ldexp, test_f16_to_f32 / test_f32_to_f16 compare
against expected bit patterns, and main() walks corner cases (signed
zeros, rounding ties, near ±Inf, denormal limit, NaN handling) plus random
round-trip error bounds.  The function bodies look intact, but the
extraction collapsed newlines and stripped the four #include header names
(the code uses fprintf/rand/ldexp and the nsimd f16 API, so presumably
<stdio.h>, <stdlib.h>, <math.h> and <nsimd/nsimd.h> — TODO confirm
against upstream).  Not compilable as-is; code reproduced verbatim below.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #define _POSIX_C_SOURCE 200112L #include #include #include #include /* ------------------------------------------------------------------------- */ float via_fp16(float a) { return nsimd_f16_to_f32(nsimd_f32_to_f16(a)); } /* ------------------------------------------------------------------------- */ float mk_fp32(int mantissa, int exponent) { return (float)ldexp((double)mantissa, exponent); } /* ------------------------------------------------------------------------- */ int test_f16_to_f32(u16 val, u32 expected) { f32 fexpected = nsimd_scalar_reinterpret_f32_u32(expected); f32 res = nsimd_u16_to_f32(val); u32 ures = nsimd_scalar_reinterpret_u32_f32(res); if ((nsimd_isnan_f32(fexpected) && !nsimd_isnan_f32(res)) || (!nsimd_isnan_f32(fexpected) && ures != expected)) { fprintf(stdout, "Error, nsimd_f16_to_f32: expected %e(0x%x) but got %e(0x%x) \n", (f64)fexpected, expected, (f64)res, ures); fflush(stdout); return 1; } return 0; } /* ------------------------------------------------------------------------- */ int test_f32_to_f16(u32 val, u16 expected) { f16 fres = nsimd_f32_to_f16(nsimd_scalar_reinterpret_f32_u32(val)); u16 ures = nsimd_scalar_reinterpret_u16_f16(fres); if (ures != expected) { fprintf(stdout, "Error, nsimd_f32_to_f16: expected 0x%x but got 0x%x \n", expected, ures); fflush(stdout); return 1; } return 0; } /* ------------------------------------------------------------------------- */ int main(void) { #ifndef NSIMD_NO_IEEE754 const float infty = nsimd_scalar_reinterpret_f32_u32(0x7F800000); 
const float m_infty = nsimd_scalar_reinterpret_f32_u32(0xFF800000); const float nan = nsimd_scalar_reinterpret_f32_u32(0x7FC00000); #endif int i; /* Some corner cases first. */ if (test_f16_to_f32(0x0000, 0x0)) { return EXIT_FAILURE; } if (test_f16_to_f32(0x8000, 0x80000000)) { return EXIT_FAILURE; } if (test_f16_to_f32(0x3C00, 0x3f800000)) { return EXIT_FAILURE; } if (test_f16_to_f32(0x13e, 0x379F0000)) { /* 1.8954277E-5 */ return EXIT_FAILURE; } if (test_f16_to_f32(0x977e, 0xBAEFC000)) { /* -1.8291473E-3 */ return EXIT_FAILURE; } if (test_f32_to_f16(0xC7BDC4FC, 0xFC00)) { /* -97161.97 */ return EXIT_FAILURE; } if (test_f32_to_f16(0x37c3642c, 0x187)) { /* 2.329246e-05 */ return EXIT_FAILURE; } if (test_f32_to_f16(0xb314e840, 0x8001)) { return EXIT_FAILURE; } /* Test rounding when the input f32 is perfectly between 2 f16 */ if (test_f32_to_f16(0xC66AD000, 0xf356)) { return EXIT_FAILURE; } /* Close to ±Inf */ if (test_f32_to_f16(0x477fefff, 0x7bff)) { return EXIT_FAILURE; } if (test_f32_to_f16(0x477ff000, 0x7c00)) { return EXIT_FAILURE; } if (test_f32_to_f16(0xC77fefff, 0xfbff)) { return EXIT_FAILURE; } if (test_f32_to_f16(0xC77ff000, 0xfc00)) { return EXIT_FAILURE; } /* Close to ±0 */ if (test_f32_to_f16(0x33000001, 0x0001)) { return EXIT_FAILURE; } if (test_f32_to_f16(0x33000000, 0x0000)) { return EXIT_FAILURE; } if (test_f32_to_f16(0xB3000001, 0x8001)) { return EXIT_FAILURE; } if (test_f32_to_f16(0xB3000000, 0x8000)) { return EXIT_FAILURE; } /* Close to the denormal limit */ if (test_f32_to_f16(0x38800000, 0x0400)) { return EXIT_FAILURE; } if (test_f32_to_f16(0x387fffff, 0x0400)) { return EXIT_FAILURE; } /* NaN special value (Copy Intel intrinsics which set the MSB of the mantissa * of NaNs to 1 when converting f16 to f32). */ if (test_f16_to_f32(0xfcf8, 0xff9f0000)) { return EXIT_FAILURE; } #ifndef NSIMD_NO_IEEE754 if (via_fp16(mk_fp32(1, 20)) != infty) { fprintf(stdout, "... 
Error, %i \n", __LINE__); fflush(stdout); return EXIT_FAILURE; } if (via_fp16(mk_fp32(-1, 20)) != m_infty) { fprintf(stdout, "... Error, %i \n", __LINE__); fflush(stdout); return EXIT_FAILURE; } if (!nsimd_isnan_f32(via_fp16(nan))) { fprintf(stdout, "... Error, %i \n", __LINE__); fflush(stdout); return EXIT_FAILURE; } #endif /* Some random inputs */ for (i = 0; i < 100; i++) { float a = (float)rand() / (float)RAND_MAX; if (fabsf(a - via_fp16(a)) > ldexpf(1.0, -9)) { return EXIT_FAILURE; } } fprintf(stdout, "... OK\n"); fflush(stdout); return EXIT_SUCCESS; } ================================================ FILE: tests/get_pack.cpp ================================================ /* Copyright (c) 2020 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #define STATUS "test of get_pack over all types" #include "tests_helpers.hpp" // ---------------------------------------------------------------------------- // Little helper for scope memory // ---------------------------------------------------------------------------- template bool get_pack_from_pack_N_1() { LOG_TEST_DEBUG("get_pack_from_pack_N_1", T); nsimd::pack pack_1(42); nsimd::pack v0_get = nsimd::get_pack<0>(pack_1); nsimd::scoped_aligned_mem_for expected(NSIMD_MAX_LEN_BIT / 8); nsimd::scoped_aligned_mem_for computed(NSIMD_MAX_LEN_BIT / 8); return nsimd_tests::check_pack_expected_vs_computed( pack_1, v0_get, "nsimd::pack", "nsimd::pack", expected.get(), computed.get()); } // ---------------------------------------------------------------------------- template bool get_pack_from_packx2_N_3() { LOG_TEST_DEBUG("get_pack_from_packx2_N_3", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::packx2 packx2_3; packx2_3.v0 = v0; packx2_3.v1 = v1; nsimd::scoped_aligned_mem_for expected(3 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(3 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::pack v0_get = nsimd::get_pack<0>(packx2_3); if (!nsimd_tests::check_pack_expected_vs_computed( v0, v0_get, "nsimd::packx2.v0", "nsimd::pack", expected.get(), computed.get())) { return false; } nsimd::pack v1_get = nsimd::get_pack<1>(packx2_3); return nsimd_tests::check_pack_expected_vs_computed( v1, v1_get, "nsimd::packx2.v1", "nsimd::pack", expected.get(), computed.get()); } // ---------------------------------------------------------------------------- template bool get_pack_from_packx3_N_3() { LOG_TEST_DEBUG("get_pack_from_packx3_N_3", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::packx3 packx3_3; packx3_3.v0 = v0; packx3_3.v1 = v1; packx3_3.v2 = v2; nsimd::scoped_aligned_mem_for expected(3 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(3 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::pack v0_get = 
nsimd::get_pack<0>(packx3_3); if (!nsimd_tests::check_pack_expected_vs_computed( v0, v0_get, "nsimd::packx3.v0", "nsimd::pack", expected.get(), computed.get())) { return false; } nsimd::pack v1_get = nsimd::get_pack<1>(packx3_3); if (!nsimd_tests::check_pack_expected_vs_computed( v1, v1_get, "nsimd::packx3.v1", "nsimd::pack", expected.get(), computed.get())) { return false; } nsimd::pack v2_get = nsimd::get_pack<2>(packx3_3); return nsimd_tests::check_pack_expected_vs_computed( v2, v2_get, "nsimd::packx3.v2", "nsimd::pack", expected.get(), computed.get()); } // ---------------------------------------------------------------------------- template bool get_pack_from_packx4_N_3() { LOG_TEST_DEBUG("get_pack_from_packx4_N_3", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::pack v3(90); nsimd::packx4 packx4_3; packx4_3.v0 = v0; packx4_3.v1 = v1; packx4_3.v2 = v2; packx4_3.v3 = v3; nsimd::scoped_aligned_mem_for expected(3 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(3 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::pack v0_get = nsimd::get_pack<0>(packx4_3); if (!nsimd_tests::check_pack_expected_vs_computed( v0, v0_get, "nsimd::packx4.v0", "nsimd::pack", expected.get(), computed.get())) { return false; } nsimd::pack v1_get = nsimd::get_pack<1>(packx4_3); if (!nsimd_tests::check_pack_expected_vs_computed( v1, v1_get, "nsimd::packx4.v1", "nsimd::pack", expected.get(), computed.get())) { return false; } nsimd::pack v2_get = nsimd::get_pack<2>(packx4_3); if (!nsimd_tests::check_pack_expected_vs_computed( v2, v2_get, "nsimd::packx4.v2", "nsimd::pack", expected.get(), computed.get())) { return false; } nsimd::pack v3_get = nsimd::get_pack<3>(packx4_3); return nsimd_tests::check_pack_expected_vs_computed( v3, v3_get, "nsimd::packx4.v3", "nsimd::pack", expected.get(), computed.get()); } // ---------------------------------------------------------------------------- template bool test_all() { if (!get_pack_from_pack_N_1()) { return 0; } 
if (!get_pack_from_packx2_N_3()) { return 0; } if (!get_pack_from_packx3_N_3()) { return 0; } if (!get_pack_from_packx4_N_3()) { return 0; } return 1; } // ---------------------------------------------------------------------------- int main(void) { if (!test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all()) { return -1; } fprintf(stdout, STATUS "... OK\n"); fflush(stdout); return 0; } ================================================ FILE: tests/memory.cpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #include #include int test_aligned_alloc() { void *ptr = nsimd_aligned_alloc(17); if (ptr == NULL || ((size_t)ptr % NSIMD_MAX_ALIGNMENT) != 0) { return EXIT_FAILURE; } nsimd_aligned_free(ptr); return EXIT_SUCCESS; } template int test_aligned_alloc_for() { void *ptr = nsimd::aligned_alloc(17); if (ptr == NULL || ((size_t)ptr % NSIMD_MAX_ALIGNMENT) != 0) { return EXIT_FAILURE; } nsimd::aligned_free(ptr); return EXIT_SUCCESS; } template int test_allocator_for() { std::vector< T, nsimd::allocator > v(17); if (v.size() != 17 || ((size_t)v.data() % NSIMD_MAX_ALIGNMENT) != 0) { return EXIT_FAILURE; } v.resize(17017); if (v.size() != 17017 || ((size_t)v.data() % NSIMD_MAX_ALIGNMENT) != 0) { return EXIT_FAILURE; } v.clear(); if (v.size() != 0) { return EXIT_FAILURE; } return EXIT_SUCCESS; } int main() { return test_aligned_alloc() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_allocator_for() || test_allocator_for() || test_allocator_for() || test_allocator_for() || test_allocator_for() || test_allocator_for() || test_allocator_for() || test_allocator_for() || test_allocator_for() || test_allocator_for(); } ================================================ FILE: tests/memory.prec11.c ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this 
permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include int main(void) { void *ptr = nsimd_aligned_alloc(17); if (ptr == NULL || ((size_t)ptr % NSIMD_MAX_ALIGNMENT) != 0) { return EXIT_FAILURE; } nsimd_aligned_free(ptr); return EXIT_SUCCESS; } ================================================ FILE: tests/modules/common.hpp ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #ifndef NSIMD_MODULES_SPMD_COMMON_HPP #define NSIMD_MODULES_SPMD_COMMON_HPP #include #include #include #include #include #include // ---------------------------------------------------------------------------- // Common code for devices #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) template __device__ bool cmp_Ts(T a, T b) { return a == b; } __device__ bool cmp_Ts(__half a, __half b) { return __half_as_short(a) == __half_as_short(b); } __device__ bool cmp_Ts(float a, float b) { return __float_as_int(a) == __float_as_int(b); } __device__ bool cmp_Ts(double a, double b) { return __double_as_longlong(a) == __double_as_longlong(b); } #elif defined(NSIMD_ONEAPI) template bool cmp_Ts(const T a, const T b) { return a == b; } bool cmp_Ts(sycl::half a, const sycl::half b) { return nsimd::gpu_reinterpret(u16(), a) == nsimd::gpu_reinterpret(u16(), b); } bool cmp_Ts(sycl::cl_float a, sycl::cl_float b) { return nsimd::gpu_reinterpret(u32(), a) == nsimd::gpu_reinterpret(u32(), b); } bool cmp_Ts(sycl::cl_double a, sycl::cl_double b) { return nsimd::gpu_reinterpret(u64(), a) == nsimd::gpu_reinterpret(u64(), b); } #endif // ---------------------------------------------------------------------------- // CUDA #if defined(NSIMD_CUDA) // perform reduction on blocks first, note that this could be optimized // but to check correctness we don't need it now template __global__ void device_cmp_blocks(T *src1, T *src2, int n) { extern __shared__ char buf_[]; // size of a block T *buf = (T*)buf_; int tid = threadIdx.x; int i = tid + blockIdx.x * blockDim.x; if (i < n) { buf[tid] = T(cmp_Ts(src1[i], src2[i]) ? 
1 : 0); } const int block_start = blockIdx.x * blockDim.x; const int block_end = block_start + blockDim.x; int size; if (block_end < n) { size = blockDim.x; } else { size = n - block_start; } __syncthreads(); for (int s = size / 2; s != 0; s /= 2) { if (tid < s && i < n) { buf[tid] = nsimd::gpu_mul(buf[tid], buf[tid + s]); __syncthreads(); } } if (tid == 0) { src1[i] = buf[0]; } } template __global__ void device_cmp_array(int *dst, T *src1, int n) { // reduction on the whole vector T buf = T(1); for (int i = 0; i < n; i += blockDim.x) { buf = nsimd::gpu_mul(buf, src1[i]); } int i = threadIdx.x + blockIdx.x * blockDim.x; if (i == 0) { dst[0] = int(buf); } } template bool cmp(T *src1, T *src2, unsigned int n) { int host_ret; int *device_ret; if (cudaMalloc((void **)&device_ret, sizeof(int)) != cudaSuccess) { std::cerr << "ERROR: cannot cudaMalloc " << sizeof(int) << " bytes\n"; exit(EXIT_FAILURE); } device_cmp_blocks<<<(n + 127) / 128, 128, 128 * sizeof(T)>>>(src1, src2, int(n)); device_cmp_array<<<(n + 127) / 128, 128>>>(device_ret, src1, int(n)); cudaMemcpy((void *)&host_ret, (void *)device_ret, sizeof(int), cudaMemcpyDeviceToHost); cudaFree((void *)device_ret); return bool(host_ret); } template bool cmp(T *src1, T *src2, unsigned int n, int) { return cmp(src1, src2, n); } template void del(T *ptr) { cudaFree(ptr); } #elif defined(NSIMD_ROCM) // ---------------------------------------------------------------------------- // ROCm // perform reduction on blocks first, note that this could be optimized // but to check correctness we don't need it now template __global__ void device_cmp_blocks(T *src1, T *src2, size_t n) { extern __shared__ char buf_[]; // size of a block T *buf = (T*)buf_; size_t tid = hipThreadIdx_x; size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if (i < n) { buf[tid] = T(cmp_Ts(src1[i], src2[i]) ? 
1 : 0); } const size_t block_start = hipBlockIdx_x * hipBlockDim_x; const size_t block_end = block_start + hipBlockDim_x; size_t size; if (block_end < n) { size = hipBlockDim_x; } else { size = n - block_start; } __syncthreads(); for (size_t s = size / 2; s != 0; s /= 2) { if (tid < s && i < n) { buf[tid] = nsimd::gpu_mul(buf[tid], buf[tid + s]); __syncthreads(); } } if (tid == 0) { src1[i] = buf[0]; } } template __global__ void device_cmp_array(int *dst, T *src1, size_t n) { // reduction on the whole vector T buf = T(1); for (size_t i = 0; i < n; i += blockDim.x) { buf = nsimd::gpu_mul(buf, src1[i]); } size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if (i == 0) { dst[0] = int(buf); } } template bool cmp(T *src1, T *src2, size_t n) { int host_ret; int *device_ret; if (hipMalloc((void **)&device_ret, sizeof(int)) != hipSuccess) { return false; } hipLaunchKernelGGL(device_cmp_blocks, (n + 127) / 128, 128, 128 * sizeof(T), 0, src1, src2, n); hipLaunchKernelGGL(device_cmp_array, (n + 127) / 128, 128, 0, 0, device_ret, src1, n); hipMemcpy((void *)&host_ret, (void *)device_ret, sizeof(int), hipMemcpyDeviceToHost); hipFree((void *)device_ret); return bool(host_ret); } template bool cmp(T *src1, T *src2, size_t n, int) { return cmp(src1, src2, n); } template void del(T *ptr) { hipFree(ptr); } #elif defined(NSIMD_ONEAPI) // ---------------------------------------------------------------------------- // oneAPI // perform reduction on blocks first, note that this could be optimized // but to check correctness we don't need it now template void device_cmp_blocks(T *const src1, const T *const src2, const size_t n, sycl::accessor local_buffer, sycl::nd_item<1> item) { size_t tid = item.get_local_id().get(0); size_t i = item.get_global_id().get(0); if (i < n) { local_buffer[tid] = T(cmp_Ts(src1[i], src2[i]) ? 
1 : 0); } item.barrier(sycl::access::fence_space::local_space); // other approach: see book p 345 if (tid == 0) { sycl::ext::oneapi::sub_group sg = item.get_sub_group(); src1[i] = sycl::ext::oneapi::reduce_over_group( sg, local_buffer[0], sycl::ext::oneapi::multiplies()); } } template void device_cmp_array(int *const dst, const T *const src1, const size_t n, sycl::nd_item<1> item) { // reduction mul on the whole vector T buf = T(1); sycl::nd_range<1> nd_range = item.get_nd_range(); sycl::range<1> range = nd_range.get_local_range(); for (size_t i = 0; i < n; i += range.size()) { buf = nsimd::gpu_mul(buf, src1[i]); } size_t i = item.get_global_id().get(0); if (i == 0) { dst[0] = int(buf); } } template bool cmp(T *const src1, const T *const src2, unsigned int n) { const size_t total_num_threads = (size_t)nsimd_kernel_param(n, 128); sycl::queue q = nsimd::oneapi::default_queue(); sycl::event e1 = q.submit([=](sycl::handler &h) { sycl::accessor local_buffer(128, h); h.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), sycl::range<1>(128)), [=](sycl::nd_item<1> item_) { device_cmp_blocks(src1, src2, size_t(n), local_buffer, item_); }); }); e1.wait_and_throw(); int *device_ret = nsimd::device_calloc(n); if (device_ret == NULL) { std::cerr << "ERROR: cannot sycl::malloc_device " << sizeof(int) << " bytes\n"; exit(EXIT_FAILURE); } sycl::event e2 = q.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), sycl::range<1>(128)), [=](sycl::nd_item<1> item_) { device_cmp_array(device_ret, src1, size_t(n), item_); }); e2.wait_and_throw(); int host_ret; q.memcpy((void *)&host_ret, (void *)device_ret, sizeof(int)).wait(); nsimd::device_free(device_ret); return bool(host_ret); } template bool cmp(T *src1, T *src2, unsigned int n, double) { return cmp(src1, src2, n); } template void del(T *ptr) { sycl::queue q = nsimd::oneapi::default_queue(); sycl::free(ptr, q); } #else // ---------------------------------------------------------------------------- // SIMD 
template bool cmp(T *src1, T *src2, unsigned int n) { return memcmp(src1, src2, n * sizeof(T)) == 0; } template bool cmp(T *src1, T *src2, unsigned int n, int ufp) { for (unsigned int i = 0; i < n; i++) { if (nsimd::ufp(src1[i], src2[i]) < ufp) { return false; } } return true; } #endif // ---------------------------------------------------------------------------- #endif ================================================ FILE: tests/nsimd-all.cpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #include #include // ---------------------------------------------------------------------------- int main() { using namespace nsimd; const int unroll = 3; typedef pack upack; const int n_max = unroll * NSIMD_MAX_LEN(f32); const int n = len(upack()); float buf[n_max]; for(int i = 0; i < n; i++) { buf[i] = float(i); } upack p = loadu(buf); p = -(p * p) + 1.0f; storeu(buf, p); for (int i = 0; i < n; i++) { fprintf(stdout, "%f vs %f\n", double(buf[i]), double(-(i * i) + 1)); } for (int i = 0; i < n; i++) { if (buf[i] != float(-(i * i) + 1)) { exit(EXIT_FAILURE); } } return 0; } ================================================ FILE: tests/nsimd.cpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #include #include #include // ---------------------------------------------------------------------------- void test_native_register() { nsimd_cpu_vf32 a = nsimd_set1_cpu_f32(1.0f); nsimd::pack p1(a); nsimd::pack p2(1.0f); if (nsimd_any_cpu_f32(nsimd_ne_cpu_f32(a, p1.native_register()))) { exit(EXIT_FAILURE); } if (nsimd_any_cpu_f32(nsimd_ne_cpu_f32(a, nsimd::native_register(p1)))) { exit(EXIT_FAILURE); } if (nsimd_any_cpu_f32(nsimd_ne_cpu_f32(nsimd::native_register(a), nsimd::native_register(p1)))) { exit(EXIT_FAILURE); } if (nsimd_any_cpu_f32( nsimd_ne_cpu_f32(p2.native_register(), p1.native_register()))) { exit(EXIT_FAILURE); } } // ---------------------------------------------------------------------------- void test_output() { nsimd_cpu_vf32 a = nsimd_set1_cpu_f32(1.0f); if (nsimd_put_cpu_f32(stdout, NULL, a) == -1) { exit(EXIT_FAILURE); } if (nsimd_put_cpu_f32(stdout, "%f", a) == -1) { exit(EXIT_FAILURE); } fflush(stdout); nsimd::pack p1(a); nsimd::pack p2(1.0f); std::cout << p1 << std::endl << p2 << std::endl; } // ---------------------------------------------------------------------------- void test_unroll() { using namespace nsimd; const int unroll = 3; typedef pack upack; const int n_max = unroll * NSIMD_MAX_LEN(f32); const int n = len(upack()); float buf[n_max]; for(int i = 0; i < n; i++) { buf[i] = float(i); } upack p = loadu(buf); p = -(p * p); storeu(buf, p); for (int i = 0; i < n; i++) { fprintf(stdout, "%f vs %f\n", double(buf[i]), double(-i * i)); } for (int i = 0; i < n; i++) { if (buf[i] != float(-(i * i))) { exit(EXIT_FAILURE); } } } // ---------------------------------------------------------------------------- int main(void) { test_native_register(); test_output(); test_unroll(); return 0; } ================================================ FILE: tests/nsimd.prec11.c ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of 
this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include int main(void) { return 0; } ================================================ FILE: tests/operator_vector_scalar.cpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include int main() { nsimd::pack a(1.0f); return (nsimd::any(a != 0) != 0 ? 0 : 1); } ================================================ FILE: tests/shifts.cpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #include // ---------------------------------------------------------------------------- int main() { using namespace nsimd; const int unroll = 3; typedef pack upack; const int n_max = unroll * NSIMD_MAX_LEN(f32); const int n = len(upack()); unsigned int buf[n_max]; for(int i = 0; i < n; i++) { buf[i] = (unsigned int)i; } upack v = loadu(buf); if (any(((v << 4) >> 4) != v)) { exit(EXIT_FAILURE); } return 0; } ================================================ FILE: tests/templated_loads_stores.cpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #include #include // ---------------------------------------------------------------------------- float *getmem(nsimd::aligned, int sz) { float *ret = (float *)nsimd::aligned_alloc(sz); if (ret == NULL) { std::cerr << "ERROR: cannot malloc aligned memory" << std::endl; } return ret; } float *getmem(nsimd::unaligned, int sz) { return getmem(nsimd::aligned(), 2 * sz) + 1; } // ---------------------------------------------------------------------------- template int test() { using namespace nsimd; f32 *buf = getmem(Alignment(), NSIMD_MAX_LEN(f32)); memset((void *)buf, 0, NSIMD_MAX_LEN(f32)); pack v = masko_load(packl(false), buf, set1 >(1.0f)); if (any(v != 1.0f)) { std::cerr << "[1]: v != [ 1.0f ... 1.0f ]" << std::endl; return -1; } v = load, Alignment>(buf); if (any(v != 0.0f)) { std::cerr << "[2]: v != [ 0.0f ... 0.0f ]" << std::endl; return -1; } v = set1 >(1.0f); store(buf, v); for (int i = 0; i < len(pack()); i++) { if (buf[i] != 1.0f) { std::cerr << "[3]: buf != [ 1.0f ... 1.0f ]" << std::endl; return -1; } } v = set1 >(2.0f); mask_store(packl(false), buf, v); for (int i = 0; i < len(pack()); i++) { if (buf[i] != 1.0f) { std::cerr << "[4]: buf != [ 1.0f ... 1.0f ]" << std::endl; return -1; } } v = maskz_load(packl(false), buf); if (any(v != 0.0f)) { std::cerr << "[5]: v != [ 0.0f ... 
0.0f ]" << std::endl; return -1; } return 0; } // ---------------------------------------------------------------------------- int main() { return test() || test(); } ================================================ FILE: tests/tests_helpers.hpp ================================================ #ifndef TESTS_HELPERS_HPP #define TESTS_HELPERS_HPP #include #include #include #include #include #include #define NSIMD_LOG_DEBUG 0 #define NSIMD_MAX_REGISTER_SIZE_BYTES NSIMD_MAX_LEN_BIT / 8 #define LOG_TEST_DEBUG(test_name, T) \ do { \ if (NSIMD_LOG_DEBUG) { \ fprintf(stdout, "%s%s%s%s%s", "\n--------- ", \ nsimd_tests::get_type_str(T()), ": ", test_name, \ "---------------\n\n"); \ } \ } while (0) #define LOG_MEMORY_CONTENT_DEBUG(vout, len_, memory_type) \ do { \ if (NSIMD_LOG_DEBUG) { \ nsimd_tests::print(vout, len_, memory_type); \ } \ } while (0) #define CHECK(a) \ { \ errno = 0; \ if (!(a)) { \ fprintf(stderr, "ERROR: " #a ":%d: %s\n", __LINE__, strerror(errno)); \ fflush(stderr); \ exit(EXIT_FAILURE); \ } \ } #define TEST_NSIMD_FALSE 0 #define TEST_NSIMD_TRUE 1 #define TEST_NSIMD_ERROR -1 /* ----------------------------------------------------------------------- */ namespace nsimd_tests { template int expected_not_equal_computed(const T expected, const T computed) { return expected != computed; } namespace fprintf_helper { // silent the warning for implicit conversion from ‘float’ to ‘double’ when // passing argument to fprintf template struct f64_if_f32_else_T { typedef T value_type; }; template <> struct f64_if_f32_else_T { typedef f64 value_type; }; const char *specifier(i8) { return "%hhu"; } const char *specifier(u8) { return "%hhu"; } const char *specifier(i16) { return "%hd"; } const char *specifier(u16) { return "%hu"; } const char *specifier(i32) { return "%d"; } const char *specifier(u32) { return "%u"; } const char *specifier(i64) { return "%ld"; } const char *specifier(u64) { return "%lu"; } const char *specifier(f32) { return "%f"; } const char 
*specifier(f64) { return "%f"; } } // namespace fprintf_helper const char *get_type_str(i8) { return "i8"; } const char *get_type_str(u8) { return "u8"; } const char *get_type_str(i16) { return "i16"; } const char *get_type_str(u16) { return "u16"; } const char *get_type_str(i32) { return "i32"; } const char *get_type_str(u32) { return "u32"; } const char *get_type_str(i64) { return "i64"; } const char *get_type_str(u64) { return "u64"; } const char *get_type_str(f32) { return "f32"; } const char *get_type_str(f64) { return "f64"; } template void print(T *const arr, const nsimd_nat len_, const char *msg) { fprintf(stdout, "%-24s: ", msg); char formatter[12]; strcpy(formatter, "%s"); strcat(formatter, fprintf_helper::specifier(T())); for (nsimd_nat ii = 0; ii < len_; ++ii) { fprintf( stdout, formatter, 0 == ii ? "{" : ", ", (typename fprintf_helper::f64_if_f32_else_T::value_type)arr[ii]); } fprintf(stdout, "%s", "}\n"); fflush(stdout); } template void init_arrays(T *const vout_expected, T *const vout_computed, const nsimd_nat len_) { for (nsimd_nat ii = 0; ii < len_; ++ii) { vout_expected[ii] = (T)-1; vout_computed[ii] = (T)1; } } /* ----------------------------- storea ---------------------------- */ // storea for all packx[Y]<1 .. N> Y in {1, 2, 3, 4} // struct storea_recurs_helper for packx[Y]<1 .. N> y in {2, 3, 4} // General definition template class pack_t, int VIx, bool EndRecurs> struct storea_recurs_helper {}; // Recursive case template class pack_t, int VIx> struct storea_recurs_helper { void operator()(T *const begin, const pack_t &pack_) const { nsimd::storea(begin, nsimd::get_pack(pack_)); storea_recurs_helper::soa_num_packs>()( begin + nsimd::len(nsimd::pack()), pack_); } }; // Base case template class pack_t, int VIx> struct storea_recurs_helper { void operator()(T *const begin, const pack_t &pack_) const { (void)begin; (void)pack_; } }; // storea function for packx[Y]<1 .. 
N> y in {2, 3, 4} template class pack_t> void storea__(T *const begin, const pack_t &pack_) { storea_recurs_helper::soa_num_packs>()(begin, pack_); } // storea for pack<1 .. N> template void storea__(T *const begin, const nsimd::pack &pack_) { nsimd::storea(begin, pack_); } /* ---------------------- check_arrays ------------------------------- */ template bool check_arrays(const T *const vout_expected, const T *const vout_computed, const nsimd_nat len_) { for (nsimd_nat ii = 0; ii < len_; ++ii) { if (expected_not_equal_computed(vout_expected[ii], vout_computed[ii])) { fprintf(stdout, STATUS "... FAIL\n"); fflush(stdout); return 0; } } return 1; } /* ---------------------- check_packs_content ------------------------ */ template class PackFrom, template class PackTo> bool check_pack_expected_vs_computed( const PackFrom &pack_from, const PackTo &pack_to, const char *from_type, const char *to_type, T *const vout_expected, T *const vout_computed) { if (nsimd::len(pack_from) != nsimd::len(pack_to)) { return 0; } const nsimd_nat len_ = (nsimd_nat)(nsimd::len(pack_to)); init_arrays(vout_expected, vout_computed, len_); storea__(vout_expected, pack_from); LOG_MEMORY_CONTENT_DEBUG(vout_expected, nsimd::len(pack_from), from_type); nsimd::storea(vout_computed, pack_to); LOG_MEMORY_CONTENT_DEBUG(vout_computed, nsimd::len(pack_to), to_type); if (!check_arrays(vout_expected, vout_computed, len_)) { return 0; } return 1; } } // namespace nsimd_tests #endif ================================================ FILE: tests/to_pack.cpp ================================================ #define STATUS "test of to_pack over all types" #include "tests_helpers.hpp" template bool to_pack_from_pack_1_N_1() { LOG_TEST_DEBUG("to_pack_from_pack_1_N_1", T); nsimd::pack pack_from(42); nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(NSIMD_MAX_REGISTER_SIZE_BYTES); return 
nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::pack", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx2_N_1() { LOG_TEST_DEBUG("to_pack_from_packx2_N_1", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::packx2 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(2 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(2 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx2", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx3_N_1() { LOG_TEST_DEBUG("to_pack_from_packx3_N_1", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::packx3 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(3 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(3 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx3", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx2_N_2() { LOG_TEST_DEBUG("to_pack_from_packx2_N_2", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::packx2 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(4 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(4 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx2", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx2_N_3() { LOG_TEST_DEBUG("to_pack_from_packx2_N_3", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::packx2 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; nsimd::pack pack_to = nsimd::to_pack(pack_from); 
nsimd::scoped_aligned_mem_for expected(6 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(6 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx2", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx3_N_2() { LOG_TEST_DEBUG("to_pack_from_packx3_N_2", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::packx3 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(6 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(6 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx3", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx3_N_3() { LOG_TEST_DEBUG("to_pack_from_packx3_N_3", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::packx3 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(9 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(9 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx3", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx4_N_1() { LOG_TEST_DEBUG("to_pack_from_packx4_N_1", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::pack v3(132); nsimd::packx4 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; pack_from.v3 = v3; nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(4 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(4 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx4", 
"nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx4_N_2() { LOG_TEST_DEBUG("to_pack_from_packx4_N_2", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::pack v3(132); nsimd::packx4 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; pack_from.v3 = v3; nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(8 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(8 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx4", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx4_N_3() { LOG_TEST_DEBUG("to_pack_from_packx4_N_3", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::pack v3(132); nsimd::packx4 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; pack_from.v3 = v3; nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(12 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(12 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx4", "nsimd::pack", expected.get(), computed.get()); } template bool test_all() { if (!to_pack_from_pack_1_N_1()) { return 0; } if (!to_pack_from_packx2_N_1()) { return 0; } if (!to_pack_from_packx2_N_2()) { return 0; } if (!to_pack_from_packx2_N_3()) { return 0; } if (!to_pack_from_packx3_N_1()) { return 0; } if (!to_pack_from_packx3_N_2()) { return 0; } if (!to_pack_from_packx3_N_3()) { return 0; } if (!to_pack_from_packx4_N_1()) { return 0; } if (!to_pack_from_packx4_N_2()) { return 0; } if (!to_pack_from_packx4_N_3()) { return 0; } return 1; } int main(void) { if (!test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all()) { return -1; } fprintf(stdout, STATUS "... 
OK\n"); fflush(stdout); return 0; } ================================================ FILE: tests/to_pack_interleave.cpp ================================================ #define STATUS "test of to_pack_interleave over all types" #include "tests_helpers.hpp" template bool to_pack_interleave_from_pack_1_N_1() { LOG_TEST_DEBUG("to_pack_interleave_from_pack_1_N_1", T); nsimd::pack pack_from(42); nsimd::pack pack_to = nsimd::to_pack_interleave(pack_from); nsimd::scoped_aligned_mem_for expected(NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::pack", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_interleave_from_packx2_N_1() { LOG_TEST_DEBUG("to_pack_interleave_from_packx2_N_1", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::packx2 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; nsimd::scoped_aligned_mem_for expected(2 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(2 * NSIMD_MAX_REGISTER_SIZE_BYTES); const int len_ = nsimd::len(nsimd::packx2()); nsimd_tests::init_arrays(expected.get(), computed.get(), len_); T *begin = expected.get(); nsimd::storea(begin, nsimd::pack(pack_from.v0.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.car)); LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, "nsimd::packx2"); nsimd::pack pack_to = nsimd::to_pack_interleave(pack_from); nsimd::storea(computed.get(), pack_to); LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, "nsimd::pack"); return nsimd_tests::check_arrays(expected.get(), computed.get(), len_); } template bool to_pack_interleave_from_packx2_N_2() { LOG_TEST_DEBUG("to_pack_interleave_from_packx2_N_2", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::packx2 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; nsimd::scoped_aligned_mem_for expected(4 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for 
computed(4 * NSIMD_MAX_REGISTER_SIZE_BYTES); const int len_ = nsimd::len(nsimd::packx2()); nsimd_tests::init_arrays(expected.get(), computed.get(), len_); T *begin = expected.get(); nsimd::storea(begin, nsimd::pack(pack_from.v0.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v0.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.cdr.car)); LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, "nsimd::packx2"); nsimd::pack pack_to = nsimd::to_pack_interleave(pack_from); nsimd::storea(computed.get(), pack_to); LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, "nsimd::pack"); return nsimd_tests::check_arrays(expected.get(), computed.get(), len_); } template bool to_pack_interleave_from_packx3_N_2() { LOG_TEST_DEBUG("to_pack_interleave_from_packx3_N_2", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::packx3 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; nsimd::scoped_aligned_mem_for expected(6 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(6 * NSIMD_MAX_REGISTER_SIZE_BYTES); const int len_ = nsimd::len(nsimd::packx3()); nsimd_tests::init_arrays(expected.get(), computed.get(), len_); T *begin = expected.get(); nsimd::storea(begin, nsimd::pack(pack_from.v0.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v0.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.cdr.car)); LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, "nsimd::packx3"); nsimd::pack pack_to = nsimd::to_pack_interleave(pack_from); 
nsimd::storea(computed.get(), pack_to); LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, "nsimd::pack"); return nsimd_tests::check_arrays(expected.get(), computed.get(), len_); } template bool to_pack_interleave_from_packx3_N_3() { LOG_TEST_DEBUG("to_pack_interleave_from_packx3_N_3", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::packx3 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; nsimd::scoped_aligned_mem_for expected(9 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(9 * NSIMD_MAX_REGISTER_SIZE_BYTES); const int len_ = nsimd::len(nsimd::packx3()); nsimd_tests::init_arrays(expected.get(), computed.get(), len_); T *begin = expected.get(); nsimd::storea(begin, nsimd::pack(pack_from.v0.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v0.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v0.cdr.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.cdr.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.cdr.cdr.car)); LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, "nsimd::packx3"); nsimd::pack pack_to = nsimd::to_pack_interleave(pack_from); nsimd::storea(computed.get(), pack_to); LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, "nsimd::pack"); return nsimd_tests::check_arrays(expected.get(), computed.get(), len_); } template bool to_pack_interleave_from_packx4_N_1() { LOG_TEST_DEBUG("to_pack_interleave_from_packx4_N_1", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); 
nsimd::pack v3(132); nsimd::packx4 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; pack_from.v3 = v3; nsimd::scoped_aligned_mem_for expected(4 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(4 * NSIMD_MAX_REGISTER_SIZE_BYTES); const int len_ = nsimd::len(nsimd::packx4()); nsimd_tests::init_arrays(expected.get(), computed.get(), len_); T *begin = expected.get(); nsimd::storea(begin, nsimd::pack(pack_from.v0.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v3.car)); LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, "nsimd::packx4"); nsimd::pack pack_to = nsimd::to_pack_interleave(pack_from); nsimd::storea(computed.get(), pack_to); LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, "nsimd::pack"); return nsimd_tests::check_arrays(expected.get(), computed.get(), len_); } template bool to_pack_interleave_from_packx4_N_2() { LOG_TEST_DEBUG("to_pack_interleave_from_packx4_N_2", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::pack v3(132); nsimd::packx4 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; pack_from.v3 = v3; nsimd::scoped_aligned_mem_for expected(8 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(8 * NSIMD_MAX_REGISTER_SIZE_BYTES); const int len_ = nsimd::len(nsimd::packx4()); nsimd_tests::init_arrays(expected.get(), computed.get(), len_); T *begin = expected.get(); nsimd::storea(begin, nsimd::pack(pack_from.v0.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v3.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, 
nsimd::pack(pack_from.v0.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v3.cdr.car)); LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, "nsimd::packx4"); nsimd::pack pack_to = nsimd::to_pack_interleave(pack_from); nsimd::storea(computed.get(), pack_to); LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, "nsimd::pack"); return nsimd_tests::check_arrays(expected.get(), computed.get(), len_); } template bool to_pack_interleave_from_packx4_N_3() { LOG_TEST_DEBUG("to_pack_interleave_from_packx4_N_3", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::pack v3(132); nsimd::packx4 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; pack_from.v3 = v3; nsimd::scoped_aligned_mem_for expected(12 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(12 * NSIMD_MAX_REGISTER_SIZE_BYTES); const int len_ = nsimd::len(nsimd::packx4()); nsimd_tests::init_arrays(expected.get(), computed.get(), len_); T *begin = expected.get(); nsimd::storea(begin, nsimd::pack(pack_from.v0.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v3.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v0.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v3.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v0.cdr.cdr.car)); begin += 
nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.cdr.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.cdr.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v3.cdr.cdr.car)); LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, "nsimd::packx4"); nsimd::pack pack_to = nsimd::to_pack_interleave(pack_from); nsimd::storea(computed.get(), pack_to); LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, "nsimd::pack"); return nsimd_tests::check_arrays(expected.get(), computed.get(), len_); } template bool test_all() { if (!to_pack_interleave_from_pack_1_N_1()) { return 0; } if (!to_pack_interleave_from_packx2_N_1()) { return 0; } if (!to_pack_interleave_from_packx2_N_2()) { return 0; } if (!to_pack_interleave_from_packx3_N_2()) { return 0; } if (!to_pack_interleave_from_packx3_N_3()) { return 0; } if (!to_pack_interleave_from_packx4_N_1()) { return 0; } if (!to_pack_interleave_from_packx4_N_2()) { return 0; } if (!to_pack_interleave_from_packx4_N_3()) { return 0; } return 1; } int main(void) { if (!test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all()) { return -1; } fprintf(stdout, STATUS "... 
OK\n"); fflush(stdout); return 0; } ================================================ FILE: tests/ufp.cpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include // ---------------------------------------------------------------------------- template U randbits() { U ret = 0; U mask = ((U)1 << CHAR_BIT) - 1; for (int i = 0; i < (int)sizeof(U); i++) { ret = (U)(ret | (U)((((U)rand()) & mask) << (CHAR_BIT * i))); } return ret; } // ---------------------------------------------------------------------------- template int log_std_ulp(U a, U b) { U d = (U)(a < b ? 
b - a : a - b); int i = 0; for (; i < 63 && d >= (U)1 << i; i++) ; return i; } // ---------------------------------------------------------------------------- template struct mantissa{}; template <> struct mantissa { static const int size = 53; }; template <> struct mantissa { static const int size = 24; }; template <> struct mantissa { static const int size = 11; }; // ---------------------------------------------------------------------------- template int test_ufp(int n) { T a = nsimd::scalar_cvt(T(), (U)1); U ua = nsimd::scalar_reinterpret(U(), a); T ap1 = nsimd::scalar_reinterpret(T(), (U)(ua + 1)); if (nsimd::ufp(a, ap1) != mantissa::size - 1) { return -1; } T am1 = nsimd::scalar_reinterpret(T(), (U)(ua - 1)); if (nsimd::ufp(a, am1) != mantissa::size - 1) { return -1; } if (nsimd::ufp(a, a) != mantissa::size) { return -1; } if (nsimd::ufp(a, a) != mantissa::size) { return -1; } if (nsimd::ufp(a, a) != mantissa::size) { return -1; } T ax4 = nsimd::scalar_cvt(T(), (U)4); if (nsimd::ufp(a, ax4) != 0) { return -1; } U mask = (U)1 << (mantissa::size - 1); U exponent = (U)((~mask) & ua); for (int i = 0; i < n; i++) { U ub = exponent | (randbits() & mask); T b = nsimd::scalar_reinterpret(T(), ub); U uc = exponent | (randbits() & mask); T c = nsimd::scalar_reinterpret(T(), uc); if (nsimd::ufp(b, c) != mantissa::size - log_std_ulp(ub, uc)) { return -1; } } return 0; } // ---------------------------------------------------------------------------- int main(void) { int n = 10000; return test_ufp(n) || test_ufp(n) || test_ufp(n); }