Showing preview only (2,861K chars total). Download the full file or copy to clipboard to get everything.
Repository: agenium-scale/nsimd
Branch: master
Commit: 702f4d179ff0
Files: 148
Total size: 2.7 MB
Directory structure:
gitextract_56lzr4bw/
├── .clang-format
├── .gitignore
├── CMakeLists.txt
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── benches/
│ └── benches.hpp
├── build.nsconfig
├── doc/
│ ├── Makefile.nix
│ ├── Makefile.win
│ ├── markdown/
│ │ ├── compilers_and_versions.md
│ │ ├── concepts.md
│ │ ├── defines.md
│ │ ├── faq.md
│ │ ├── fp16.md
│ │ ├── how_tests_are_done.md
│ │ ├── memory.md
│ │ ├── modules/
│ │ │ ├── .gitignore
│ │ │ └── fixed_point/
│ │ │ └── overview.md
│ │ ├── pack.md
│ │ └── tutorial.md
│ ├── md2html.cpp
│ └── what_is_wrapped.cpp
├── egg/
│ ├── __init__.py
│ ├── common.py
│ ├── cuda.py
│ ├── experiments/
│ │ ├── gen_sleef_operators.py
│ │ ├── round-ppc.c
│ │ └── upcvt-sve.c
│ ├── gen_adv_c_api.py
│ ├── gen_adv_cxx_api.py
│ ├── gen_archis.py
│ ├── gen_base_apis.py
│ ├── gen_benches.py
│ ├── gen_doc.py
│ ├── gen_friendly_but_not_optimized.py
│ ├── gen_modules.py
│ ├── gen_scalar_utilities.py
│ ├── gen_src.py
│ ├── gen_tests.py
│ ├── get_sleef_code.py
│ ├── hatch.py
│ ├── modules/
│ │ ├── fixed_point/
│ │ │ ├── gen_doc.py
│ │ │ ├── gen_tests.py
│ │ │ └── hatch.py
│ │ ├── memory_management/
│ │ │ └── hatch.py
│ │ ├── random/
│ │ │ └── hatch.py
│ │ ├── spmd/
│ │ │ └── hatch.py
│ │ └── tet1d/
│ │ └── hatch.py
│ ├── oneapi.py
│ ├── operators.py
│ ├── platform_arm.py
│ ├── platform_cpu.py
│ ├── platform_ppc.py
│ ├── platform_x86.py
│ ├── rocm.py
│ ├── scalar.py
│ └── x86_load_store_deg234.py
├── examples/
│ ├── module_fixed_point.cpp
│ └── tutorial.cpp
├── include/
│ └── nsimd/
│ ├── c_adv_api.h
│ ├── cxx_adv_api.hpp
│ ├── cxx_adv_api_aliases.hpp
│ ├── modules/
│ │ ├── fixed_point.hpp
│ │ ├── memory_management.hpp
│ │ ├── spmd.hpp
│ │ └── tet1d.hpp
│ ├── nsimd-all.h
│ ├── nsimd-all.hpp
│ └── nsimd.h
├── scripts/
│ ├── FindNSIMD.cmake
│ ├── aarch64-linux-gnu-clang++.sh
│ ├── aarch64-linux-gnu-clang.sh
│ ├── build-tests.bat
│ ├── build-tests.sh
│ ├── build.bat
│ ├── build.sh
│ ├── ci-clang.txt
│ ├── ci-scale.txt
│ ├── ci-test.txt
│ ├── ci.sh
│ ├── compile-gmp-mpfr-for-wasm.sh
│ ├── gen_github_doc.sh
│ ├── hipcc.sh
│ ├── init-benches-deps.sh
│ ├── local-ci-rerun.ini
│ ├── local-ci.ini
│ ├── local-ci.sh
│ ├── one-liner.c
│ ├── powerpc64le-linux-gnu-clang++.sh
│ ├── powerpc64le-linux-gnu-clang.sh
│ ├── setup.bat
│ └── setup.sh
├── src/
│ ├── dd.h
│ ├── df.h
│ ├── estrin.h
│ ├── fp16.cpp
│ ├── gpu.cpp
│ ├── helperadvsimd.h
│ ├── helperavx.h
│ ├── helperavx2.h
│ ├── helperavx512f.h
│ ├── helperneon32.h
│ ├── helperpower_128.h
│ ├── helpersse2.h
│ ├── helpersve.h
│ ├── memory.cpp
│ ├── misc.h
│ ├── rempitab.c
│ ├── rename.h
│ ├── renameadvsimd.h
│ ├── renameavx.h
│ ├── renameavx2.h
│ ├── renameavx512f.h
│ ├── renameneon32.h
│ ├── renamesse2.h
│ ├── renamesse4.h
│ ├── renamesve.h
│ ├── renamevsx.h
│ ├── sleefdp.c
│ ├── sleefsimddp.c
│ ├── sleefsimddp_emulation.c
│ ├── sleefsimdsp.c
│ ├── sleefsimdsp_emulation.c
│ ├── sleefsp.c
│ └── ufp.cpp
└── tests/
├── CMakeLists.txt.sh
├── FindNSIMD.cmake.sh
├── allocator.cpp
├── assign_arith.cpp
├── booleans.cpp
├── c11_vec.c
├── cxx_adv_api_aliases.cpp
├── fp16.prec11.c
├── get_pack.cpp
├── memory.cpp
├── memory.prec11.c
├── modules/
│ └── common.hpp
├── nsimd-all.cpp
├── nsimd.cpp
├── nsimd.prec11.c
├── operator_vector_scalar.cpp
├── shifts.cpp
├── templated_loads_stores.cpp
├── tests_helpers.hpp
├── to_pack.cpp
├── to_pack_interleave.cpp
└── ufp.cpp
================================================
FILE CONTENTS
================================================
================================================
FILE: .clang-format
================================================
Standard: Cpp03
ColumnLimit: 79
================================================
FILE: .gitignore
================================================
# Common build dirs
build*/
# Dependencies
nstools/
# Binaries
*.o
*.so
*.pyc
*.exe
*.dll
*.dylib
# Generated files
## API
src/api_*.cpp
src/api_*
## Platform specific code
include/nsimd/arm
include/nsimd/cpu
include/nsimd/cxx_adv_api_functions.hpp
include/nsimd/friendly_but_not_optimized.hpp
include/nsimd/functions.h
include/nsimd/ppc
include/nsimd/x86
## Tests
tests/c_base
tests/cxx_base
tests/cxx_adv
tests/modules/tet1d/
tests/modules/fixed_point/
tests/modules/rand/*.cpp
tests/modules/spmd/
tests/modules/random/
## Benches
benches/cxx_adv
## Modules
include/nsimd/modules/tet1d/
include/nsimd/modules/spmd/
include/nsimd/modules/fixed_point/
include/nsimd/scalar_utilities.h
## Doc
doc/html/*
!doc/html/assets/
doc/markdown/overview.md
doc/markdown/api.md
doc/markdown/api_*.md
doc/markdown/module_fixed_point_api*.md
doc/markdown/module_fixed_point_overview.md
doc/markdown/module_spmd_api*.md
doc/markdown/module_spmd_overview.md
doc/markdown/module_memory_management_overview.md
doc/md2html
doc/tmp.html
## Ulps
ulps/
## CI
_ci/
================================================
FILE: CMakeLists.txt
================================================
# MIT License
#
# Copyright (c) 2021 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
cmake_minimum_required(VERSION 3.0.2)
project(NSIMD VERSION 3.0 LANGUAGES C CXX)
# -----------------------------------------------------------------------------
# First check that NSIMD code has been generated.  Most of the library
# (notably include/nsimd/functions.h) is produced by the Python generation
# system in egg/hatch.py; if the generated entry header is missing, run the
# generator once before any target is configured.
if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/include/nsimd/functions.h")
# On Windows the Python 3 interpreter is usually invoked as plain "python";
# elsewhere "python3" is used to avoid picking up a Python 2 install.
if (WIN32)
execute_process(COMMAND
python ${CMAKE_CURRENT_SOURCE_DIR}\\egg\\hatch.py -lf)
else()
execute_process(COMMAND
python3 ${CMAKE_CURRENT_SOURCE_DIR}/egg/hatch.py -lf)
endif()
endif()
# -----------------------------------------------------------------------------
# Compilations options
option(NSIMD_ARM32_IS_ARMEL "Set whether ARM32 is in fact armel or armhf" ON)

# Map a SIMD extension name onto the list of compiler arguments needed to
# build code for it.
#   simd_ext - SIMD extension name (sse2, avx2, sve512, vmx, cuda, ...)
#   argument - name of the variable, set in the caller's scope, that receives
#              the arguments as a CMake list
# Unknown extension names fall back to the scalar "cpu" definition.
function(nsimd_get_compiler_argument simd_ext argument)
  if (MSVC)
    if (CMAKE_CL_64)
      # 64-bit cl.exe: SSE2 is implied and /arch:SSE2 is not accepted
      set(mapping_sse2 "/DSSE2")
      set(mapping_sse42 "/DSSE42")
    else()
      set(mapping_sse2 "/DSSE2;/arch:SSE2")
      set(mapping_sse42 "/DSSE42;/arch:SSE2")
    endif()
    set(mapping_avx "/DAVX;/arch:AVX")
    set(mapping_avx2 "/DAVX2;/arch:AVX2")
    set(mapping_avx512_knl "/DAVX512_KNL;/arch:AVX512")
    set(mapping_avx512_skylake "/DAVX512_SKYLAKE;/arch:AVX512")
    set(mapping_neon128 "/DNEON128;/arch:VFPv4")
    set(mapping_aarch64 "/DAARCH64")
    set(mapping_sve "/DSVE")
    set(mapping_sve128 "/DSVE128")
    set(mapping_sve256 "/DSVE256")
    set(mapping_sve512 "/DSVE512")
    set(mapping_sve1024 "/DSVE1024")
    set(mapping_sve2048 "/DSVE2048")
    set(mapping_vmx "/DVMX")
    set(mapping_vsx "/DVSX")
    set(mapping_cuda "/DCUDA")
    set(mapping_rocm "/DROCM")
    # Bug fix: was "/ONEAPI" which is not a preprocessor definition; every
    # other MSVC mapping uses the /D prefix (cf. /DCUDA, /DROCM above).
    set(mapping_oneapi "/DONEAPI")
  else()
    set(mapping_sse2 "-DSSE2;-msse2" )
    set(mapping_sse42 "-DSSE42;-msse4.2" )
    set(mapping_avx "-DAVX;-mavx;-mno-avx256-split-unaligned-load"
        ";-mno-avx256-split-unaligned-store" )
    set(mapping_avx2 "-DAVX2;-mavx2;-mfma;-mno-avx256-split-unaligned-load"
        ";-mno-avx256-split-unaligned-store" )
    set(mapping_avx512_knl "-DAVX512_KNL;-mavx512f;-mavx512pf;-mavx512er"
        ";-mavx512cd")
    set(mapping_avx512_skylake "-DAVX512_SKYLAKE;-mavx512f;-mavx512dq"
        ";-mavx512cd;-mavx512bw;-mavx512vl")
    if (NSIMD_ARM32_IS_ARMEL)
      set(mapping_neon128 "-DNEON128;-mfloat-abi=softfp;-mfpu=neon")
    else()
      set(mapping_neon128 "-DNEON128;-mfpu=neon")
    endif()
    set(mapping_aarch64 "-DAARCH64")
    set(mapping_sve "-DSVE;-march=armv8.2-a+sve")
    set(mapping_sve128 "-DSVE128;-march=armv8.2-a+sve;-msve-vector-bits=128")
    set(mapping_sve256 "-DSVE256;-march=armv8.2-a+sve;-msve-vector-bits=256")
    set(mapping_sve512 "-DSVE512;-march=armv8.2-a+sve;-msve-vector-bits=512")
    set(mapping_sve1024 "-DSVE1024;-march=armv8.2-a+sve"
        ";-msve-vector-bits=1024")
    # Bug fix: the two flags were separated by a space instead of the list
    # separator ";" (cf. sve1024 above), so the compiler received the single
    # malformed argument "-DSVE2048 -march=armv8.2-a+sve".
    set(mapping_sve2048 "-DSVE2048;-march=armv8.2-a+sve"
        ";-msve-vector-bits=2048")
    set(mapping_vmx "-DVMX;-mcpu=powerpc64le;-maltivec")
    set(mapping_vsx "-DVSX;-mcpu=powerpc64le;-mvsx")
    set(mapping_cuda "-DCUDA")
    set(mapping_rocm "-DROCM")
    set(mapping_oneapi "-DONEAPI")
  endif()
  if (DEFINED mapping_${simd_ext})
    set(${argument} "${mapping_${simd_ext}}" PARENT_SCOPE)
  else()
    # Unknown or scalar target: fall back to the plain CPU implementation
    if (MSVC)
      set(${argument} "/DCPU" PARENT_SCOPE)
    else()
      set(${argument} "-DCPU" PARENT_SCOPE)
    endif()
  endif()
endfunction()

# Default to the scalar implementation when no -Dsimd=... was given on the
# configure command line.
if (NOT DEFINED simd)
  set(simd "cpu")
endif()
nsimd_get_compiler_argument(${simd} NSIMD_COMPILATION_OPTIONS)
# -----------------------------------------------------------------------------
# Object file selection
# NSIMD_OBJS lists the object-library targets that make up libnsimd for the
# chosen SIMD extension.  Every build gets the base objects (fp16, gpu,
# memory, the scalar cpu API and the scalar sleef kernels); each extension
# then adds the api_* objects of the extensions it includes plus the matching
# vector sleef kernels.
set(NSIMD_OBJS "fp16;gpu;memory;api_cpu;rempitab;sleefsp;sleefdp")
if ("${simd}" STREQUAL "sse2")
set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;sleef_sse2_f32;sleef_sse2_f64")
elseif ("${simd}" STREQUAL "sse42")
set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;api_sse42;"
"sleef_sse2_f32;sleef_sse2_f64;"
"sleef_sse42_f32;sleef_sse42_f64")
elseif ("${simd}" STREQUAL "avx")
set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;api_sse42;api_avx;"
"sleef_sse2_f32;sleef_sse2_f64;"
"sleef_sse42_f32;sleef_sse42_f64;"
"sleef_avx_f32;sleef_avx_f64")
elseif ("${simd}" STREQUAL "avx2")
set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;api_sse42;api_avx;api_avx2;"
"sleef_sse2_f32;sleef_sse2_f64;"
"sleef_sse42_f32;sleef_sse42_f64;"
"sleef_avx_f32;sleef_avx_f64;"
"sleef_avx2_f32;sleef_avx2_f64")
elseif ("${simd}" STREQUAL "avx512_knl")
# NOTE(review): "api_avx2" below lacks the trailing ";" the sibling branches
# have; both spellings yield the same list once CMake joins the quoted
# arguments with ";", but confirm before normalizing.
set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;api_sse42;api_avx;api_avx2"
"sleef_sse2_f32;sleef_sse2_f64;"
"sleef_sse42_f32;sleef_sse42_f64;"
"sleef_avx_f32;sleef_avx_f64;"
"sleef_avx2_f32;sleef_avx2_f64;"
"api_avx512_knl;sleef_avx512_knl_f32;sleef_avx512_knl_f64")
elseif ("${simd}" STREQUAL "avx512_skylake")
set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;api_sse42;api_avx;api_avx2;"
"api_avx512_skylake;sleef_avx512_skylake_f32;"
"sleef_sse2_f32;sleef_sse2_f64;"
"sleef_sse42_f32;sleef_sse42_f64;"
"sleef_avx_f32;sleef_avx_f64;"
"sleef_avx2_f32;sleef_avx2_f64;"
"sleef_avx512_skylake_f64")
elseif ("${simd}" STREQUAL "neon128")
set(NSIMD_OBJS "${NSIMD_OBJS};api_neon128;"
"sleef_neon128_f32;sleef_neon128_f64")
elseif ("${simd}" STREQUAL "aarch64")
set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;"
"sleef_aarch64_f32;sleef_aarch64_f64")
elseif ("${simd}" STREQUAL "sve")
set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;api_sve;"
"sleef_aarch64_f32;sleef_aarch64_f64;"
"sleef_sve_f32;sleef_sve_f64")
elseif ("${simd}" STREQUAL "sve128")
set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;api_sve128;"
"sleef_aarch64_f32;sleef_aarch64_f64;"
"sleef_sve128_f32;sleef_sve128_f64")
elseif ("${simd}" STREQUAL "sve256")
set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;api_sve256;"
"sleef_aarch64_f32;sleef_aarch64_f64;"
"sleef_sve256_f32;sleef_sve256_f64")
elseif ("${simd}" STREQUAL "sve512")
set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;api_sve512;"
"sleef_aarch64_f32;sleef_aarch64_f64;"
"sleef_sve512_f32;sleef_sve512_f64")
elseif ("${simd}" STREQUAL "sve1024")
set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;api_sve1024;"
"sleef_aarch64_f32;sleef_aarch64_f64;"
"sleef_sve1024_f32;sleef_sve1024_f64")
elseif ("${simd}" STREQUAL "sve2048")
set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;api_sve2048;"
"sleef_aarch64_f32;sleef_aarch64_f64;"
"sleef_sve2048_f32;sleef_sve2048_f64")
elseif ("${simd}" STREQUAL "vmx")
set(NSIMD_OBJS "${NSIMD_OBJS};api_vmx;sleef_vmx_f32;sleef_vmx_f64")
elseif ("${simd}" STREQUAL "vsx")
set(NSIMD_OBJS "${NSIMD_OBJS};api_vmx;api_vsx;sleef_vmx_f32;sleef_vmx_f64;"
"sleef_vsx_f32;sleef_vsx_f64")
endif()
# -----------------------------------------------------------------------------
# Rules for building the library
# One OBJECT library per entry of NSIMD_OBJS.  The source file is found by
# name when it exists; the sleef_* kernels are all compiled from a handful of
# generic sleef .c files, differing only in the compile flags below.
set(NSIMD_LIB_DEPS "")
foreach(o ${NSIMD_OBJS})
if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/${o}.cpp")
add_library(${o} OBJECT "${CMAKE_CURRENT_SOURCE_DIR}/src/${o}.cpp")
elseif(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/${o}.c")
add_library(${o} OBJECT "${CMAKE_CURRENT_SOURCE_DIR}/src/${o}.c")
# neon128 and vmx have no double-precision SIMD kernel: use the f64
# emulation translation unit instead of sleefsimddp.c
elseif(("${o}" STREQUAL "sleef_neon128_f64") OR
("${o}" STREQUAL "sleef_vmx_f64"))
add_library(${o} OBJECT
"${CMAKE_CURRENT_SOURCE_DIR}/src/sleefsimddp_emulation.c")
elseif("${o}" STREQUAL "sleef_vmx_f32")
add_library(${o} OBJECT
"${CMAKE_CURRENT_SOURCE_DIR}/src/sleefsimdsp_emulation.c")
elseif(o MATCHES "sleef_.*_f32")
add_library(${o} OBJECT "${CMAKE_CURRENT_SOURCE_DIR}/src/sleefsimdsp.c")
elseif(o MATCHES "sleef_.*_f64")
add_library(${o} OBJECT "${CMAKE_CURRENT_SOURCE_DIR}/src/sleefsimddp.c")
endif()
# Flags common to every sleef translation unit
if (MSVC)
set(sleef_cflags "/DNDEBUG;/DDORENAME=1")
else()
set(sleef_cflags "-DNDEBUG;-DDORENAME=1")
endif()
# Objects end up in a shared library, so they must be PIC
set_property(TARGET ${o} PROPERTY POSITION_INDEPENDENT_CODE ON)
target_include_directories(${o} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")
if (MSVC)
target_compile_definitions(${o} PUBLIC "/D_CRT_SECURE_NO_WARNINGS")
endif()
# buf collects the compile options of this particular object.  Each api_*
# object is built with the flags of its own extension so that every object
# contains code for exactly one SIMD extension.
set(buf "")
if ("${o}" STREQUAL "api_sse2")
nsimd_get_compiler_argument("sse2" buf)
elseif ("${o}" STREQUAL "api_sse42")
nsimd_get_compiler_argument("sse42" buf)
elseif ("${o}" STREQUAL "api_avx")
nsimd_get_compiler_argument("avx" buf)
elseif ("${o}" STREQUAL "api_avx2")
nsimd_get_compiler_argument("avx2" buf)
elseif ("${o}" STREQUAL "api_avx512_knl")
nsimd_get_compiler_argument("avx512_knl" buf)
elseif ("${o}" STREQUAL "api_avx512_skylake")
nsimd_get_compiler_argument("avx512_skylake" buf)
elseif ("${o}" STREQUAL "api_neon128")
nsimd_get_compiler_argument("neon128" buf)
elseif ("${o}" STREQUAL "api_aarch64")
nsimd_get_compiler_argument("aarch64" buf)
elseif ("${o}" STREQUAL "api_sve")
nsimd_get_compiler_argument("sve" buf)
elseif ("${o}" STREQUAL "api_sve128")
nsimd_get_compiler_argument("sve128" buf)
elseif ("${o}" STREQUAL "api_sve256")
nsimd_get_compiler_argument("sve256" buf)
elseif ("${o}" STREQUAL "api_sve512")
nsimd_get_compiler_argument("sve512" buf)
elseif ("${o}" STREQUAL "api_sve1024")
nsimd_get_compiler_argument("sve1024" buf)
elseif ("${o}" STREQUAL "api_sve2048")
nsimd_get_compiler_argument("sve2048" buf)
elseif ("${o}" STREQUAL "api_vmx")
nsimd_get_compiler_argument("vmx" buf)
elseif ("${o}" STREQUAL "api_vsx")
nsimd_get_compiler_argument("vsx" buf)
elseif ("${o}" STREQUAL "api_cuda")
nsimd_get_compiler_argument("cuda" buf)
elseif ("${o}" STREQUAL "api_rocm")
nsimd_get_compiler_argument("rocm" buf)
elseif ("${o}" STREQUAL "api_cpu")
nsimd_get_compiler_argument("cpu" buf)
# Scalar sleef objects only need the common sleef flags
elseif ("${o}" STREQUAL "rempitab")
list(APPEND buf "${sleef_cflags}")
elseif ("${o}" STREQUAL "sleefsp")
list(APPEND buf "${sleef_cflags}")
elseif ("${o}" STREQUAL "sleefdp")
list(APPEND buf "${sleef_cflags}")
# Vector sleef objects need the extension flags plus the ENABLE_* macro
# that selects the right sleef helper header
elseif ("${o}" MATCHES "sleef_sse2_")
nsimd_get_compiler_argument("sse2" buf)
list(APPEND buf "-DNSIMD_SSE2;-DENABLE_SSE2=1;${sleef_cflags}")
elseif ("${o}" MATCHES "sleef_sse42_")
nsimd_get_compiler_argument("sse42" buf)
list(APPEND buf "-DNSIMD_SSE42;-DENABLE_SSE4=1;${sleef_cflags}")
elseif ("${o}" MATCHES "sleef_avx_")
nsimd_get_compiler_argument("avx" buf)
list(APPEND buf "-DNSIMD_AVX;-DENABLE_AVX=1;${sleef_cflags}")
elseif ("${o}" MATCHES "sleef_avx2_")
nsimd_get_compiler_argument("avx2" buf)
list(APPEND buf "-DNSIMD_AVX2;-DENABLE_AVX2=1;${sleef_cflags}")
elseif ("${o}" MATCHES "sleef_avx512_knl_")
nsimd_get_compiler_argument("avx512_knl" buf)
list(APPEND buf "-DNSIMD_AVX512_KNL;-DENABLE_AVX512F=1;${sleef_cflags}")
elseif ("${o}" MATCHES "sleef_avx512_skylake_")
nsimd_get_compiler_argument("avx512_skylake" buf)
list(APPEND buf
"-DNSIMD_AVX512_SKYLAKE;-DENABLE_AVX512F=1;${sleef_cflags}")
elseif ("${o}" MATCHES "sleef_neon128_")
nsimd_get_compiler_argument("neon128" buf)
list(APPEND buf "-DNSIMD_NEON128;-DENABLE_NEON32=1;${sleef_cflags}")
elseif ("${o}" MATCHES "sleef_aarch64_")
nsimd_get_compiler_argument("aarch64" buf)
list(APPEND buf "-DNSIMD_AARCH64;-DENABLE_ADVSIMD=1;${sleef_cflags}")
elseif ("${o}" MATCHES "sleef_sve_")
nsimd_get_compiler_argument("sve" buf)
list(APPEND buf "-DNSIMD_SVE;-DENABLE_SVE=1;${sleef_cflags}")
elseif ("${o}" MATCHES "sleef_sve128_")
nsimd_get_compiler_argument("sve128" buf)
list(APPEND buf "-DNSIMD_SVE128;-DENABLE_SVE=1;${sleef_cflags}")
elseif ("${o}" MATCHES "sleef_sve256_")
nsimd_get_compiler_argument("sve256" buf)
list(APPEND buf "-DNSIMD_SVE256;-DENABLE_SVE=1;${sleef_cflags}")
elseif ("${o}" MATCHES "sleef_sve512_")
nsimd_get_compiler_argument("sve512" buf)
list(APPEND buf "-DNSIMD_SVE512;-DENABLE_SVE=1;${sleef_cflags}")
elseif ("${o}" MATCHES "sleef_sve1024_")
nsimd_get_compiler_argument("sve1024" buf)
list(APPEND buf "-DNSIMD_SVE1024;-DENABLE_SVE=1;${sleef_cflags}")
elseif ("${o}" MATCHES "sleef_sve2048_")
nsimd_get_compiler_argument("sve2048" buf)
list(APPEND buf "-DNSIMD_SVE2048;-DENABLE_SVE=1;${sleef_cflags}")
elseif ("${o}" MATCHES "sleef_vmx_")
nsimd_get_compiler_argument("vmx" buf)
list(APPEND buf "-DNSIMD_VMX;-DENABLE_VSX=1;${sleef_cflags}")
elseif ("${o}" MATCHES "sleef_vsx_")
nsimd_get_compiler_argument("vsx" buf)
list(APPEND buf "-DNSIMD_VSX;-DENABLE_VSX=1;${sleef_cflags}")
else()
set(buf "")
endif()
if (NOT "${buf}" STREQUAL "")
target_compile_options(${o} PUBLIC "${buf}")
endif()
# Collect the object files for the final shared library
list(APPEND NSIMD_LIB_DEPS "$<TARGET_OBJECTS:${o}>")
endforeach()
# The library name encodes the SIMD extension it was built for, e.g.
# nsimd_avx2, nsimd_cpu, ...
set(NSIMD_LIB_TARGET "nsimd_${simd}")
add_library(${NSIMD_LIB_TARGET} SHARED ${NSIMD_LIB_DEPS})
# -----------------------------------------------------------------------------
# Installation stuff
# On Windows the .dll is a RUNTIME artifact and its import .lib an ARCHIVE
# artifact; on other platforms the shared object is a LIBRARY artifact.
if (WIN32)
install(TARGETS ${NSIMD_LIB_TARGET} RUNTIME DESTINATION lib
ARCHIVE DESTINATION lib)
else()
install(TARGETS ${NSIMD_LIB_TARGET} LIBRARY DESTINATION lib)
endif()
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/nsimd
DESTINATION include)
================================================
FILE: CONTRIBUTING.md
================================================
<!--
Copyright (c) 2019 Agenium Scale
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-->
## How to Contribute to `nsimd`?
You are welcome to contribute to `nsimd`. This document gives some details on
how to add/wrap new intrinsics. When you have finished fixing some bugs or
adding some new features, please make a pull request. One of our repository
maintainers will then merge or comment on the pull request.
## Prerequisites
- Respect the philosophy of the library (see [index](index.md).)
- Basic knowledge of Python 3.
- Good knowledge of C.
- Good knowledge of C++.
- Good knowledge of SIMD programming.
## How Do I Add Support for a New Intrinsic?
### Introduction
`nsimd` currently supports the following architectures:
- `CPU`:
+ `CPU` called `CPU` in source code. This "extension" is not really one as it
is only present so that code written with `nsimd` can compile and run on
targets not supported by `nsimd` or with no SIMD.
- Intel:
+ `SSE2` called `SSE2` in source code.
+ `SSE4.2` called `SSE42` in source code.
+ `AVX` called `AVX` in source code.
+ `AVX2` called `AVX2` in source code.
+ `AVX-512` as found on KNLs called `AVX512_KNL` in source code.
+ `AVX-512` as found on Xeon Skylake CPUs called `AVX512_SKYLAKE` in source
code.
- Arm
+ `NEON` 128 bits as found on ARMv7 CPUs called `NEON128` in source code.
+ `NEON` 128 bits as found on Aarch64 CPUs called `AARCH64` in source code.
+ `SVE` called `SVE` in source code.
+ `SVE` 128 bits known at compiled time called `SVE128` in source code.
+ `SVE` 256 bits known at compiled time called `SVE256` in source code.
+ `SVE` 512 bits known at compiled time called `SVE512` in source code.
+ `SVE` 1024 bits known at compiled time called `SVE1024` in source code.
+ `SVE` 2048 bits known at compiled time called `SVE2048` in source code.
- IBM POWERPC
+ `VMX` 128 bits as found on POWER6 CPUs called `VMX` in source code.
+ `VSX` 128 bits as found on POWER7/8 CPUs called `VSX` in source code.
- NVIDIA
+ `CUDA` called `CUDA` in source code
- AMD
+ `ROCm` called `ROCM` in source code
- Intel oneAPI
+ `oneAPI` called `ONEAPI` in source code
`nsimd` currently supports the following types:
- `i8`: signed integers over 8 bits (usually `signed char`),
- `u8`: unsigned integers over 8 bits (usually `unsigned char`),
- `i16`: signed integers over 16 bits (usually `short`),
- `u16`: unsigned integers over 16 bits (usually `unsigned short`),
- `i32`: signed integers over 32 bits (usually `int`),
- `u32`: unsigned integers over 32 bits (usually `unsigned int`),
- `i64`: signed integers over 64 bits (usually `long`),
- `u64`: unsigned integers over 64 bits (usually `unsigned long`),
- `f16`: floating point numbers over 16 bits in IEEE format called `float16`
in the rest of this document
(<https://en.wikipedia.org/wiki/Half-precision_floating-point_format>),
- `f32`: floating point numbers over 32 bits (usually `float`)
- `f64`: floating point numbers over 64 bits (usually `double`),
As C and C++ do not support `float16`, `nsimd` provides its own types to handle
them. Therefore special care has to be taken when implementing
intrinsics/operators on architectures that do not natively support them.
We will make the following misuse of language in the rest of this document.
The type taken by intrinsics is of course a SIMD vector and more precisely a
SIMD vector of chars or a SIMD vector of `short`s or a SIMD vector of `int`s…
Therefore when we will talk about an intrinsic, we will say that it takes
type `T` as arguments when it takes in fact a SIMD vector of `T`.
### Our imaginary intrinsic
We will add support to the library for the following imaginary intrinsic: given
a SIMD vector, suppose that this intrinsic called `foo` takes each element `x`
of the vector and computes `1 / (1 - x) + 1 / (1 - x)^2`. Moreover suppose that
hardware vendors all propose this intrinsic only for floating point numbers as
follows:
- CPU (no intrinsics is given of course in standard C and C++)
- Intel (no intrinsics is given for `float16`s)
+ `SSE2`: no intrinsics is provided.
+ `SSE42`: `_mm_foo_ps` for `float`s and `_mm_foo_pd` for `double`s.
+ `AVX`: no intrinsics is provided.
+ `AVX2`: `_mm256_foo_ps` for `float`s and `_mm256_foo_pd` for `double`s.
+ `AVX512_KNL`: no intrinsics is provided.
+ `AVX512_SKYLAKE`: `_mm512_foo_ps` for `float`s and `_mm512_foo_pd` for
`double`s.
- ARM
+ `NEON128`: `vfooq_f16` for `float16`s, `vfooq_f32` for `float`s and no
intrinsics for `double`s.
+ `AARCH64`: same as `NEON128` but `vfooq_f64` for doubles.
+ `SVE`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively
`float16`s, `float`s and `double`s.
+ `SVE128`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively
`float16`s, `float`s and `double`s.
+ `SVE256`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively
`float16`s, `float`s and `double`s.
+ `SVE512`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively
`float16`s, `float`s and `double`s.
+ `SVE1024`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively
`float16`s, `float`s and `double`s.
+ `SVE2048`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively
`float16`s, `float`s and `double`s.
- IBM POWERPC
+ `VMX`: `vec_foo` for `float`s and no intrinsics for `double`s.
+ `VSX`: `vec_foo` for `float`s and `double`s.
- NVIDIA
+ `CUDA`: no intrinsics is provided.
- AMD
+ `ROCM`: no intrinsics is provided.
- Intel oneAPI
+ `ONEAPI`: no intrinsics is provided.
First thing to do is to declare this new intrinsic to the generation system.
A lot of work is done by the generation system such as generating all functions
signatures for C and C++ APIs, tests, benchmarks and documentation. Of course
the default documentation does not say much but you can add a better
description.
### Registering the intrinsic (or operator)
A function or an intrinsic is called an operator in the generation system.
Go at the bottom of `egg/operators.py` and add the following just after
the `Rsqrt11` class.
```python
class Foo(Operator):
full_name = 'foo'
signature = 'v foo v'
types = common.ftypes
domain = Domain('R\{1}')
categories = [DocBasicArithmetic]
```
This little class will be processed by the generation system so that operator
`foo` will be available for the end-user of the library in both C and C++ APIs.
Each member of this class controls how the generation is done:
- `full_name` is a string containing the human readable name of the operator.
If not given, the class name will be taken for it.
- `signature` is a string describing how many arguments the operator takes and
of what kinds. This member is mandatory and must respect the following syntax:
`return_type name_of_operator arg1_type arg2_type ...` where `return_type`
and the `arg*_type` can be taken from the following list:
+ `v ` SIMD vector parameter
+ `vx2 ` Structure of 2 SIMD vector parameters
+ `vx3 ` Structure of 3 SIMD vector parameters
+ `vx4 ` Structure of 4 SIMD vector parameters
+ `l ` SIMD vector of logicals parameter
+ `s ` Scalar parameter
+ `* ` Pointer to scalar parameter
+ `c* ` Pointer to const scalar parameter
+ `_ ` void (only for return type)
+ `p ` Parameter (integer)
In our case `v foo v` means that `foo` takes one SIMD vector as argument and
returns a SIMD vector as output. Several signatures will be generated for this
intrinsic according to the types it supports. In our case the intrinsic
only supports floating point types.
- `types` is a Python list indicating which types are supported by the
intrinsic. If not given, the intrinsic is supposed to support all types.
Some Python lists are predefined to help the programmer:
+ `ftypes = ['f64', 'f32', 'f16'] ` All floating point types
+ `ftypes_no_f16 = ['f64', 'f32'] `
+ `itypes = ['i64', 'i32', 'i16', 'i8'] ` All signed integer types
+ `utypes = ['u64', 'u32', 'u16', 'u8'] ` All unsigned integer types
+ `iutypes = itypes + utypes`
+ `types = ftypes + iutypes`
- `domain` is a string indicating the mathematical domain of definition of the
operator. This helps for benchmarks and tests for generating random numbers
as inputs in the correct interval. In our case `R\{1}` means all real numbers
(of course all floating point numbers) except `1` for which the operator
cannot be computed. For examples see how other operators are defined in
`egg/operators.py`.
- `categories` is a list of Python classes that indicates the generation
system to which categories `foo` belongs. The list of available categories
is as follow:
+ `DocShuffle ` for Shuffle functions
+ `DocTrigo ` for Trigonometric functions
+ `DocHyper ` for Hyperbolic functions
+ `DocExpLog ` for Exponential and logarithmic functions
+ `DocBasicArithmetic ` for Basic arithmetic operators
+ `DocBitsOperators ` for Bits manipulation operators
+ `DocLogicalOperators ` for Logicals operators
+ `DocMisc ` for Miscellaneous
+ `DocLoadStore ` for Loads & stores
+ `DocComparison ` for Comparison operators
+ `DocRounding ` for Rounding functions
+ `DocConversion ` for Conversion operators
If no category corresponds to the operator you want to add to `nsimd` then feel
free to create a new category (see the bottom of this document)
Many other members are supported by the generation system. We describe them
quickly here and will give more details in a later version of this document.
Default values are given in square brackets:
- `cxx_operator [= None]` in case the operator has a corresponding C++ operator.
- `autogen_cxx_adv [= True]` in case the C++ advanced API signatures for this
operator must not be auto-generated.
- `output_to [= common.OUTPUT_TO_SAME_TYPE]` in case the operator output type
differs from its input type. Possible values are:
+ `OUTPUT_TO_SAME_TYPE`: output is of same type as input.
+ `OUTPUT_TO_SAME_SIZE_TYPES`: output can be any type of same bit size.
+ `OUTPUT_TO_UP_TYPES`: output can be any type of bit size twice the bit
bit size of the input. In this case the input type will never be a 64-bits
type.
+ `OUTPUT_TO_DOWN_TYPES`: output can be any type of bit size half the bit
bit size of the input. In this case the input type will never be a 8-bits
type.
- `src [= False]` in case the code must be compiled in the library.
- `load_store [= False]` in case the operator loads/store data from/to
memory.
- `do_bench [= True]` in case benchmarks for the operator must not be
auto-generated.
- `desc [= '']` description (in Markdown format) that will appear in the
documentation for the operator.
- `bench_auto_against_cpu [= True]` for auto-generation of benchmark against
`nsimd` CPU implementation.
- `bench_auto_against_mipp [= False]` for auto-generation of benchmark against
the MIPP library.
- `bench_auto_against_sleef [= False]` for auto-generation of benchmark against
the Sleef library.
- `bench_auto_against_std [= False]` for auto-generation of benchmark against
the standard library.
- `tests_mpfr [= False]` in case the operator has an MPFR counterpart for
comparison, then test the correctness of the operator against it.
- `tests_ulps [= False]` in case the auto-generated tests has to compare ULPs
(<https://en.wikipedia.org/wiki/Unit_in_the_last_place>).
- `has_scalar_impl [= True]` in case the operator has a CPU scalar and GPU
implementation.
### Implementing the operator
Now that the operator is registered, all signatures will be generated but
the implementations will be missing. Type
```sh
python3 egg/hatch.py -lf
```
and the following files (among many other) should appear:
- `include/nsimd/cpu/cpu/foo.h`
- `include/nsimd/x86/sse2/foo.h`
- `include/nsimd/x86/sse42/foo.h`
- `include/nsimd/x86/avx/foo.h`
- `include/nsimd/x86/avx2/foo.h`
- `include/nsimd/x86/avx512_knl/foo.h`
- `include/nsimd/x86/avx512_skylake/foo.h`
- `include/nsimd/arm/neon128/foo.h`
- `include/nsimd/arm/aarch64/foo.h`
- `include/nsimd/arm/sve/foo.h`
- `include/nsimd/arm/sve128/foo.h`
- `include/nsimd/arm/sve256/foo.h`
- `include/nsimd/arm/sve512/foo.h`
- `include/nsimd/arm/sve1024/foo.h`
- `include/nsimd/arm/sve2048/foo.h`
- `include/nsimd/ppc/vmx/foo.h`
- `include/nsimd/ppc/vsx/foo.h`
They each correspond to the implementations of the operator for each supported
architectures. When opening one of these files the implementations in plain
C and then in C++ (falling back to the C function) should be there but all the
C implementations are reduced to `abort();`. This is the default when none is
provided. Note that the "cpu" architecture is just a fallback involving no
SIMD at all. This is used on architectures not supported by `nsimd` or when the
architecture does not offer any SIMD.
Providing implementations for `foo` is done by completing the following Python
files:
- `egg/platform_cpu.py`
- `egg/platform_x86.py`
- `egg/platform_arm.py`
- `egg/platform_ppc.py`
- `egg/scalar.py`
- `egg/cuda.py`
- `egg/hip.py`
- `egg/oneapi.py`
The idea is to produce plain C (not C++) code using Python string format. Each
of the Python files provides some helper functions to ease as much as
possible the programmer's job. But every file provides the same "global"
variables available in every functions and is designed in the same way:
1. At the bottom of the file is the `get_impl` function taking the following
arguments:
+ `func ` the name of the operator the system is currently
auto-generating.
+ `simd_ext ` the SIMD extension for which the system wants the
implemetation.
+ `from_typ ` the input type of the argument that will be passed to the
operator.
+ `to_typ ` the output type produced by the operator.
2. Inside this function lies a Python dictionary that provides functions
implementing each operator. The string containing the C code for the
implementations can be put here directly but usually the string is
returned by a Python function that is written above in the same file.
3. At the top of the file lies helper functions that helps generating code.
This is specific to each architecture. Do not hesitate to look at it.
Let's begin by the `cpu` implementations. It turns out that there is no SIMD
extension in this case, and by convention, `simd_ext == 'cpu'` and this
argument can therefore be ignored. So we first add an entry to the `impls`
Python dictionary of the `get_impl` function:
```python
impls = {
...
'reverse': reverse1(from_typ),
'addv': addv(from_typ),
'foo': foo1(from_typ) # Added at the bottom of the dictionary
}
if simd_ext != 'cpu':
raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext))
...
```
Then, above in the file we write the Python function `foo1` that will provide
the C implementation of operator `foo`:
```python
def foo1(typ):
return func_body(
'''ret.v{{i}} = ({typ})1 / (({typ})1 - {in0}.v{{i}}) +
({typ})1 / ((({typ})1 - {in0}.v{{i}}) *
(({typ})1 - {in0}.v{{i}}));'''. \
format(**fmtspec), typ)
```
First note that the arguments names passed to the operator in its C
implementation are not known in the Python side. Several other parameters
are not known or are cumbersome to find out. Therefore each function has access
to the `fmtspec` Python dictionary that hold some of these values:
- `in0`: name of the first parameter for the C implementation.
- `in1`: name of the second parameter for the C implementation.
- `in2`: name of the third parameter for the C implementation.
- `simd_ext`: name of the SIMD extension (for the cpu architecture, this is
equal to `"cpu"`).
- `from_typ`: type of the input.
- `to_typ`: type of the output.
- `typ`: equals `from_typ`, shorter to write as usually `from_typ == to_typ`.
- `utyp`: bitfield type of the same size of `typ`.
- `typnbits`: number of bits in `typ`.
The CPU extension can emulate 64-bits or 128-bits wide SIMD vectors. Each type
is a struct containing as many members as necessary so that `sizeof(T) *
(number of members) == 64 or 128`. To spare the developer from writing
two cases (64-bits wide and 128-bits wide) the `func_body` function is provided
as a helper. Note that the index `{{i}}` is in double curly brackets to go
through two Python string formats:
1. The first pass is done within the `foo1` Python function and replaces
`{typ}` and `{in0}`. In this pass `{{i}}` is formatted into `{i}`.
2. The second pass is done by the `func_body` function which unrolls the string
to the necessary number and replaces `{i}` by the corresponding number. The
produced C code will look as if one had written the same statement for each
member of the input struct.
Then note that as plain C (and C++) does not support native 16-bits wide
floating point types, `nsimd` emulates it with a C struct containing 4 floats
(32-bits wide floating point numbers). In some cases extra care has to be
taken to handle this type.
For each SIMD extension one can find a `types.h` file (for `cpu` the files can
be found in `include/nsimd/cpu/cpu/types.h`) that declares all SIMD types. If
you have any doubt on a given type do not hesitate to take a look at this file.
Note also that this file is auto-generated and is therefore readable only after
a successful first `python3 egg/hatch.py -Af`.
Now that the `cpu` implementation is written, you should be able to write the
implementation of `foo` for other architectures. Each architecture has its
particularities. We will cover them now by providing directly the Python
implementations and explaining in less details.
Finally note that `clang-format` is called by the generation system to
autoformat produced C/C++ code. Therefore prefer indenting C code strings within
the Python according to Python indentations, do not write C code beginning at
column 0 in Python files.
### For Intel
```python
def foo1(simd_ext, typ):
if typ == 'f16':
return '''nsimd_{simd_ext}_vf16 ret;
ret.v1 = {pre}foo_ps({in0}.v1);
ret.v2 = {pre}foo_ps({in0}.v2);
return ret;'''.format(**fmtspec)
if simd_ext == 'sse2':
return emulate_op1('foo', 'sse2', typ)
if simd_ext in ['avx', 'avx512_knl']:
return split_opn('foo', simd_ext, typ, 1)
return 'return {pre}foo{suf}({in0});'.format(**fmtspec)
```
Here are some notes concerning the Intel implementation:
1. `float16`s are emulated with two SIMD vectors of `float`s.
2. When the intrinsic is provided by Intel one can access it easily by
constructing it with `{pre}` and `{suf}`. Indeed all Intel intrinsics
names follow a pattern with a prefix indicating the SIMD extension and a
suffix indicating the type of data. As for `{in0}`, `{pre}` and
`{suf}` are provided and contain the correct values with respect to
`simd_ext` and `typ`, you do not need to compute them yourself.
3. When the intrinsic is not provided by Intel then one has to use tricks.
+ For `SSE2` one can use complete emulation, that is, putting the content of
the SIMD vector into a C-array, working on it with a simple for loop and
loading back the result into the resulting SIMD vector. As said before a
lot of helper functions are provided and the `emulate_op1` Python function
avoids writing this for-loop emulation by hand.
+ For `AVX` and `AVX512_KNL`, one can fallback to the "lower" SIMD extension
(`SSE42` for `AVX` and `AVX2` for `AVX512_KNL`) by splitting the input
vector into two smaller vectors belonging to the "lower" SIMD extension. In
this case again the tedious and cumbersome work is done by the `split_opn`
Python function.
4. Do not forget to add the `foo` entry to the `impls` dictionary in the `get_impl`
Python function.
### For ARM
```python
def foo1(simd_ext, typ):
ret = f16f64(simd_ext, typ, 'foo', 'foo', 1)
if ret != '':
return ret
if simd_ext in neon:
return 'return vfooq_{suf}({in0});'.format(**fmtspec)
else:
return 'return svfoo_{suf}_z({svtrue}, {in0});'.format(**fmtspec)
```
Here are some notes concerning the ARM implementation:
1. `float16`s can be natively supported but this is not mandatory.
2. On 32-bits ARM chips, intrinsics on `double` almost never exist.
3. The Python helper function `f16f64` hides a lot of details concerning the
above two points. If the function returns a non empty string then it means
that the returned string contains C code to handle the case given by the
pair `(simd_ext, typ)`. We advise you to look at the generated C code. You
will see the `nsimd_FP16` macro used. When defined it indicates that `nsimd`
is compiled with native `float16` support. This also affects SIMD types (see
`nsimd/include/arm/*/types.h`.)
4. Do not forget to add the `foo` entry to the `impls` dictionary in the
`get_impl` Python function.
### For IBM POWERPC
```python
def foo1(simd_ext, typ):
if has_to_be_emulated(simd_ext, typ):
return emulation_code(op, simd_ext, typ, ['v', 'v'])
else:
return 'return vec_foo({in0});'.format(**fmtspec)
```
Here are some notes concerning the PPC implementation:
1. For VMX, intrinsics on `double` almost never exist.
2. The Python helper function `has_to_be_emulated` returns `True` when the
implementation of `foo` concerns float16 or `double`s for `VMX`. When this
function returns True you can then use `emulation_code`.
3. The `emulation_code` function returns a generic implementation of an
operator. However this implementation is not suitable for every operator
and the programmer has to take care of that.
4. Do not forget to add the `foo` entry to the `impls` dictionary in the
`get_impl` Python function.
### The scalar CPU version
```python
def foo1(func, typ):
    normal = \
        'return ({typ})(1 / (1 - {in0}) + 1 / ((1 - {in0}) * (1 - {in0})));'
if typ == 'f16':
return \
'''#ifdef NSIMD_NATIVE_FP16
{normal}
#else
return nsimd_f32_to_f16({normal_fp16});
#endif'''. \
        format(normal=normal.format(**fmtspec),
               normal_fp16=normal.format(
                   in0='nsimd_f16_to_f32({in0})'.format(**fmtspec)))
else:
return normal.format(**fmtspec)
```
The only caveat for the CPU scalar implementation is to handle float16
correctly. The easiest way to do is to have the same implementation as float32
but replacing `{in0}`'s by `nsimd_f16_to_f32({in0})`'s and converting back
the float32 result to a float16.
### The GPU versions
The GPU generator Python files `cuda.py`, `rocm.py` and `oneapi.py` are a bit
different from the other files but it is easy to find where to add the relevant
pieces of code. Note that since ROCm syntax is fully compatible with CUDA's,
one only needs to modify the `cuda.py` file, while `oneapi.py` is easy to
understand.
The code to add for float32's is as follows to be added inside the `get_impl`
Python function.
```python
return '1 / (1 - {in0}) + 1 / ((1 - {in0}) * (1 - {in0}))'.format(**fmtspec)
```
The code for CUDA and ROCm to add for float16's is as follows. It has to be
added inside the `get_impl_f16` Python function.
```python
arch53_code = '''__half one = __float2half(1.0f);
return __hadd(
__hdiv(one, __hsub(one, {in0})),
__hmul(
__hdiv(one, __hsub(one, {in0})),
__hdiv(one, __hsub(one, {in0}))
)
);'''.format(**fmtspec)
```
As Intel oneAPI natively supports float16's the code is the same as the one
for floats:
```python
return '1 / (1 - {in0}) + 1 / ((1 - {in0}) * (1 - {in0}))'.format(**fmtspec)
```
### Implementing the test for the operator
Now that we have written the implementations for the `foo` operator we must
write the corresponding tests. For tests all generations are done by
`egg/gen_tests.py`. Writing tests is simpler. The intrinsic that we just
implemented can be tested by an already-written test pattern code, namely by
the `gen_test` Python function.
Here is how the `egg/gen_tests.py` is organized:
1. The entry point is the `doit` function located at the bottom of the file.
2. In the `doit` function a dispatching is done according to the operator that
is to be tested. All operators cannot be tested by the same C/C++ code. The
reading of all different kind of tests is rather easy and we are not going
through all the code in this document.
3. All Python functions generating test code begin with the following:
```python
filename = get_filename(opts, op, typ, lang)
if filename == None:
return
```
This must be the case for newly created functions. The `get_filename` function
ensures that the file must be created with respect to the command line
options given to the `egg/hatch.py` script. Then note that to output to a
file the Python function `open_utf8` must be used to handle Windows and to
automatically put the MIT license at the beginning of generated files.
4. Tests must be written for C base API, the C++ base API and the C++ advanced
API.
If you need to create a new kind of tests then the best way is to copy-paste
the Python function that produces the test that most resembles the test you
want. Then modify the newly created function to suit your needs. Here is a
quick overview of Python functions present in the `egg/gen_tests.py` file:
- `gen_nbtrue`, `gen_adv`, `gen_all_any` generate tests for reduction operators.
- `gen_reinterpret_convert` generates tests for non closed operators.
- `gen_load_store` generates tests for load/store operators.
- `gen_reverse` generates tests for one type of shuffle but can be extended
for other kind of shuffles.
- `gen_test` generates tests for "standard" operators, typically those who do
some computations. This is the kind of tests that can handle our `foo`
operator and therefore nothing has to be done on our part.
## Not all tests are to be done
As explained in <how_tests_are_done.md> doing all tests is not recommended.
Take for example the `cvt` operator. Testing `cvt` from say `f32` to `i32`
is complicated as the result depends on how NaN, infinities are handled and
on the current rounding mode. In turn these parameters depend on the vendor, the
chip, the bugs in the chip, the chosen rounding mode by users or other
softwares...
The function `should_i_do_the_test` gives a hint on whether to implement the
test or not. Its code is really simple and you may need to modify it. The
listing below is a possible implementation that takes care of the case
described in the previous paragraph.
```python
def should_i_do_the_test(operator, tt='', t=''):
if operator.name == 'cvt' and t in common.ftypes and tt in common.iutypes:
# When converting from float to int to float then we may not
# get the initial result because of roundings. As tests are usually
# done by going back and forth then both directions get tested in the
# end
return False
if operator.name == 'reinterpret' and t in common.iutypes and \
tt in common.ftypes:
# When reinterpreting from int to float we may get NaN or infinities
# and no ones knows what this will give when going back to ints
# especially when float16 are emulated. Again as tests are done by
# going back and forth both directions get tested in the end.
return False
if operator.name in ['notb', 'andb', 'andnotb', 'xorb', 'orb'] and \
t == 'f16':
# Bit operations on float16 are hard to check because they are
# emulated in most cases. Therefore going back and forth with
# reinterprets for doing bitwise operations make the bit in the last
# place to wrong. This is normal but makes testing real hard. So for
# now we do not test them on float16.
return False
if operator.name in ['len', 'set1', 'set1l', 'mask_for_loop_tail',
'loadu', 'loada', 'storeu', 'storea', 'loadla',
'loadlu', 'storela', 'storelu', 'if_else1']:
# These functions are used in almost every tests so we consider
# that they are extensively tested.
return False
if operator.name in ['store2a', 'store2u', 'store3a', 'store3u',
'store4a', 'store4u', 'scatter', 'scatter_linear',
'downcvt', 'to_logical']:
# These functions are tested along with their load counterparts.
# downcvt is tested along with upcvt and to_logical is tested with
# to_mask
return False
return True
```
### Conclusion
At first sight the implementation of `foo` seems complicated because intrinsics
for all types and all architectures are not provided by vendors. But `nsimd`
provides a lot of helper functions and tries to put away details so that
wrapping intrinsics is quickly done and easy, the goal is that the programmer
concentrate on the implementation itself. But be aware that more complicated
tricks can be implemented. Browse through a `platform_*.py` file to see what
kind of tricks are used and how they are implemented.
## How do I add a new category?
Adding a category is much simpler than adding an operator. It suffices to add
a class with only one member named `title` as follows:
```python
class DocMyCategoryName(DocCategory):
title = 'My category name functions'
```
The class must inherit from the `DocCategory` class and its name must begin
with `Doc`. The system will then take it into account, generate the entry
in the documentation and so on.
## How do I add a new module?
A module is a set of functionalities that makes sense to be provided alongside
NSIMD but that cannot be part of NSIMD's core. Therefore it is not mandatory
to provide all C and C++ APIs versions or to support all operators. For what
follows let's call the module we want to implement `mymod`.
Include files (written by hand or generated by Python) must be placed into
the `nsimd/include/nsimd/modules/mymod` directory and a master header file must
be placed at `nsimd/include/nsimd/modules/mymod.h`. You are free to organize
the `nsimd/include/nsimd/modules/mymod` folder as you see fit.
Your module has to be found by NSIMD generation system. For this you must
create the `nsimd/egg/modules/mymod` directory and
`nsimd/egg/modules/mymod/hatch.py` file. The latter must expose the following
functions:
- `def name()`
  Return a human readable module name beginning with an uppercase letter.
- `def desc()`
Return a small description of 4-5 lines of text for the module. This text
will appear in the `modules.md` file that lists all the available modules.
- `def doc_menu()`
  Return a Python dictionary containing the menu for when the generation
system produces the HTML pages of documentation for the module. The entry
markdown file must be `nsimd/doc/markdown/module_mymod_overview.md` for
module documentation. Then if your module has no other documentation
  pages this function can simply return `dict()`. Otherwise it has to return
`{'menu_label': 'filename_suffix', ...}` where `menu_label` is a menu entry
to be displayed and pointing to `nsimd/egg/module_mymod_filename_suffix.md`.
  Several functions in `egg/common.py` (`import common`) have to be used to
ease crafting documentation pages filenames:
+ `def get_markdown_dir(opts)`
Return the folder into which markdown for documentation have to be put.
+ `def get_markdown_file(opts, name, module='')`
Return the filename to be passed to the `common.open_utf8` function. The
`name` argument acts as a suffix as explained above while the `module`
    argument is the name of the module.
- `def doit(opts)`
  Is the real entry point of the module. This function has the responsibility
to generate all the code for your module. It can of course import all Python
files from NSIMD and take advantage of the `operators.py` file. To
  respect the switches passed by the user at command line it is recommended to
write this function as follows.
```python
def doit(opts):
common.myprint(opts, 'Generating module mymod')
if opts.library:
gen_module_headers(opts)
if opts.tests:
gen_tests(opts)
if opts.doc:
gen_doc(opts)
```
Tests for the module have to be put into the `nsimd/tests/mymod` directory.
## How do I add a new platform?
The list of supported platforms is determined by looking in the `egg`
directory and listing all `platform_*.py` files. Each file must contain all
SIMD extensions for a given platform. For example the default (no SIMD) is
given by `platform_cpu.py`. All the Intel SIMD extensions are given by
`platform_x86.py`.
Each Python file that implements a platform must be named
`platform_[name for platform].py` and must export at least the following
functions:
- `def get_simd_exts()`
Return the list of SIMD extensions implemented by this file as a Python
list.
- `def get_prev_simd_ext(simd_ext)`
Usually SIMD extensions are added over time by vendors and a chip
implementing a SIMD extension supports previous SIMD extension. This
function must return the previous SIMD extension supported by the vendor if
it exists otherwise it must return the empty string. Note that `cpu` is the
  only SIMD extension that has no previous SIMD extension. Every other SIMD
extension has at least `cpu` as previous SIMD extension.
- `def get_native_typ(simd_ext, typ)`
Return the native SIMD type corresponding of the SIMD extension `simd_ext`
whose elements are of type `typ`. If `typ` or `simd_ext` is not known then a
ValueError exception must be raised.
- `def get_type(simd_ext, typ)`
Returns the "intrinsic" SIMD type corresponding to the given
arithmetic type. If `typ` or `simd_ext` is not known then a ValueError
exception must be raised.
- `def get_additional_include(func, simd_ext, typ)`
Returns additional include if need be for the implementation of `func` for
the given `simd_ext` and `typ`.
- `def get_logical_type(simd_ext, typ)`
Returns the "intrinsic" logical SIMD type corresponding to the given
arithmetic type. If `typ` or `simd_ext` is not known then a ValueError
exception must be raised.
- `def get_nb_registers(simd_ext)`
Returns the number of registers for this SIMD extension.
- `def get_impl(func, simd_ext, from_typ, to_typ)`
Returns the implementation (C code) for `func` on type `typ` for `simd_ext`.
If `typ` or `simd_ext` is not known then a ValueError exception must be
raised. Any `func` given satisfies `S func(T a0, T a1, ... T an)`.
- `def has_compatible_SoA_types(simd_ext)`
Returns True iff the given `simd_ext` has structure of arrays types
compatible with NSIMD i.e. whose members are v1, v2, ... Returns False
otherwise. If `simd_ext` is not known then a ValueError exception must be
raised.
- `def get_SoA_type(simd_ext, typ, deg)`
Returns the structure of arrays types for the given `typ`, `simd_ext` and
`deg`. If `simd_ext` is not known or does not name a type whose
corresponding SoA types are compatible with NSIMD then a ValueError
exception must be raised.
- `def emulate_fp16(simd_ext)`
Returns True iff the given SIMD extension has to emulate FP16's with
two FP32's.
Then you are free to implement the SIMD extensions for the platform. See above
on how to add the implementations of operators.
================================================
FILE: LICENSE
================================================
Copyright (c) 2019 Agenium Scale
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
Documentation can be found [here](https://agenium-scale.github.io/nsimd/).
We put a lot of effort into
[testing](https://agenium-scale.github.io/nsimd/how_tests_are_done.html).
# What is NSIMD?
At its core, NSIMD is a vectorization library that abstracts [SIMD
programming](<https://en.wikipedia.org/wiki/SIMD>). It was designed to exploit
the maximum power of processors at a low development cost. NSIMD comes with
modules. As of now two of them add support for GPUs to NSIMD. The
direction that NSIMD is taking is to provide several programming paradigms
to address different problems and to allow a wider support of architectures.
With two of its modules NSIMD provides three programming paradigms:
- Imperative programming provided by NSIMD core that supports a lot of
CPU/SIMD extensions.
- Expressions templates provided by the TET1D module that supports all
architectures from NSIMD core and adds support for NVIDIA and AMD GPUs.
- Single Program Multiple Data provided by the SPMD module that supports all
architectures from NSIMD core and adds support for NVIDIA and AMD GPUs.
## Supported architectures
| Architecture | NSIMD core | TET1D module | SPMD module |
|:--------------------------------------|:----------:|:------------:|:-----------:|
| CPU (scalar functions) | Y | Y | Y |
| CPU (128-bits SIMD emulation) | Y | Y | Y |
| Intel SSE 2 | Y | Y | Y |
| Intel SSE 4.2 | Y | Y | Y |
| Intel AVX | Y | Y | Y |
| Intel AVX2 | Y | Y | Y |
| Intel AVX-512 for KNLs | Y | Y | Y |
| Intel AVX-512 for Skylake processors | Y | Y | Y |
| Arm NEON 128 bits (ARMv7 and earlier) | Y | Y | Y |
| Arm NEON 128 bits (ARMv8 and later) | Y | Y | Y |
| Arm SVE (original sizeless SVE) | Y | Y | Y |
| Arm fixed sized SVE | Y | Y | Y |
| IBM POWERPC VMX | Y | Y | Y |
| IBM POWERPC VSX | Y | Y | Y |
| NVIDIA CUDA | N | Y | Y |
| AMD ROCm | N | Y | Y |
| Intel oneAPI | N | Y | Y |
## Contributions
| Contributor | Contribution(s) |
|:---------------------|:--------------------------------------------------|
| Guillaume Quintin | Maintainer + main contributor |
| Alan Kelly | Arm NEON + mathematical functions |
| Kenny Péou | Fixed point module |
| Xavier Berault | PowerPC VMX and VSX |
| Vianney Stricher | NSIMD core + oneAPI in SPMD and TET1D modules |
| Quentin Khan | Soa/AoS loads and stores |
| Paul Gannay | PowerPC VMX, VSX + testing system |
| Charly Chevalier | Benchmarking system + Python internals |
| Erik Schnetter | Fixes + code generation |
| Lénaïc Bagnères | Fixes + TET1D module |
| Jean-Didier Pailleux | Shuffles operators |
## How it works?
To achieve maximum performance, NSIMD mainly relies on the inline optimization
pass of the compiler. Therefore using any mainstream compiler such as GCC,
Clang, MSVC, XL C/C++, ICC and others with NSIMD will give you a zero-cost SIMD
abstraction library.
To allow inlining, a lot of code is placed in header files. *Small* functions
such as addition, multiplication, square root, etc, are all present in header
files whereas big functions such as I/O are put in source files that are
compiled as a `.so`/`.dll` library.
NSIMD provides C89, C11, C++98, C++11, C++14 and C++20 APIs. All APIs allow
writing generic code. For the C API this is achieved through a thin layer of
macros and with the `_Generic` keyword for the C advanced API; for the C++ APIs
it is achieved using templates and function overloading. The C++ APIs are split
into two. The first part is a C-like API with only function calls and direct
type definitions for SIMD types while the second one provides operator
overloading, higher level type definitions that allows unrolling. C++11, C++14
APIs add for instance templated type definitions and templated constants while
the C++20 API uses concepts for better error reporting.
Binary compatibility is guaranteed by the fact that only a C ABI is exposed.
The C++ API only wraps the C calls.
## Supported compilers
NSIMD is tested with GCC, Clang, MSVC, NVCC, HIPCC and ARMClang. As a C89 and a
C++98 API are provided, other compilers should work fine. Old compiler versions
should work as long as they support the targeted SIMD extension. For instance,
NSIMD can compile SSE 4.2 code with MSVC 2010.
# Build the library
## CMake
As CMake is widely used as a build system, we have added support for building
the library only and the corresponding find module.
```sh
mkdir build
cd build
cmake .. -Dsimd=SIMD_EXT
make
make install
```
where `SIMD_EXT` is one of the following: CPU, SSE2, SSE42, AVX, AVX2,
AVX512\_KNL, AVX512\_SKYLAKE, NEON128, AARCH64, SVE, SVE128, SVE256, SVE512,
SVE1024, SVE2048, VMX, VSX, CUDA, ROCM.
Note that when compiling for NEON128 on Linux one has to choose the ABI, either
armel or armhf. Default is armel. As CMake is unable to autodetect this
parameter one has to tell CMake manually.
```sh
cmake .. -Dsimd=neon128 # for armel
cmake .. -Dsimd=neon128 -DNSIMD_ARM32_IS_ARMEL=OFF # for armhf
```
We provide in the `scripts` directory a CMake find module to find NSIMD on
your system. One can let the module find NSIMD on its own, if several
versions for different SIMD extensions of NSIMD are installed then the
module will find and return one. There is no guarantee on which versions will
be chosen by the module.
```cmake
find_package(NSIMD)
```
If one wants a specific version of the library for a given SIMD extension then
use the `COMPONENTS` part of `find_package`. Only one component is supported
at a time.
```cmake
find_package(NSIMD COMPONENTS avx2) # find only NSIMD for Intel AVX2
find_package(NSIMD COMPONENTS sve) # find only NSIMD for Arm SVE
find_package(NSIMD COMPONENTS sse2 sse42) # unsupported
```
## Nsconfig
The support for CMake has been limited to building the library only. If you
wish to run tests or contribute you need to use nsconfig as CMake has several
flaws:
- too slow especially on Windows,
- inability to use several compilers at once,
- inability to have a portable build system,
- very poor support for portable compilation flags,
- ...
## Dependencies (nsconfig only)
Generating C/C++ files is done by the Python3 code contained in the `egg`.
Python should be installed by default on any Linux distro. On Windows it comes
with the latest versions of Visual Studio on Windows
(<https://visualstudio.microsoft.com/vs/community/>), you can also download and
install it directly from <https://www.python.org/>.
The Python code can call `clang-format` to properly format all generated C/C++
source. On Linux you can install it via your package manager. On Windows you
can use the official binary at <https://llvm.org/builds/>.
Compiling the library requires a C++98 compiler. Any version of GCC, Clang or
MSVC will do. Note that the produced library and header files for the end-user
are C89, C++98, C++11 compatible. Note that C/C++ files are generated by a
bunch of Python scripts and these scripts must be executed before building
the library.
## Build for Linux
```bash
bash scripts/build.sh for simd_ext1/.../simd_extN with comp1/.../compN
```
For each combination a directory `build-simd_ext-comp` will be created and
will contain the library. Supported SIMD extension are:
- sse2
- sse42
- avx
- avx2
- avx512\_knl
- avx512\_skylake
- neon128
- aarch64
- sve
- sve128
- sve256
- sve512
- sve1024
- sve2048
- vmx
- vsx
- cuda
- rocm
Supported compiler are:
- gcc
- clang
- icc
- armclang
- xlc
- dpcpp
- fcc
- cl
- nvcc
- hipcc
Note that certain combinations of SIMD extension/compiler are not supported
such as aarch64 with icc, or avx512\_skylake with nvcc.
## Build on Windows
Make sure you are typing in a Visual Studio prompt. The command is almost the
same as for Linux with the same constraints on the pairs SIMD
extension/compilers.
```batch
scripts\build.bat for simd_ext1/.../simd_extN with comp1/.../compN
```
## More details on building the library
The library uses a tool called nsconfig
(<https://github.com/agenium-scale/nstools>) which is basically a Makefile
translator. If you have just built NSIMD following what's described above
you should have a `nstools` directory which contains `bin/nsconfig`. If not
you can generate it using on Linux
```bash
bash scripts/setup.sh
```
and on Windows
```batch
scripts\setup.bat
```
Then you can use `nsconfig` directly it has a syntax similar to CMake at
command line. Here is a quick tutorial with Linux command line. We first
go to the NSIMD directory and generate both NSIMD and nsconfig.
```bash
$ cd nsimd
$ python3 egg/hatch.py -ltf
$ bash scripts/setup.sh
$ mkdir build
$ cd build
```
Help can be displayed using `--help`.
```bash
$ ../nstools/bin/nsconfig --help
usage: nsconfig [OPTIONS]... DIRECTORY
Configure project for compilation.
-v verbose mode, useful for debugging
-nodev Build system will never call nsconfig
-DVAR=VALUE Set value of variable VAR to VALUE
-list-vars List project specific variable
-GBUILD_SYSTEM Produce files for build system BUILD_SYSTEM
Supported BUILD_SYSTEM:
make POSIX Makefile
gnumake GNU Makefile
nmake Microsot Visual Studio NMake Makefile
ninja Ninja build file (this is the default)
list-vars List project specific variables
-oOUTPUT Output to OUTPUT instead of default
-suite=SUITE Use compilers from SUITE as default ones
Supported SUITE:
gcc The GNU compiler collection
msvc Microsoft C and C++ compiler
llvm The LLVM compiler infrastructure
armclang Arm suite of compilers based on LLVM
xlc IBM suite of compilers
fcc_trad_mode
Fujitsu compiler in traditional mode
fcc_clang_mode
Fujitsu compiler in clang mode
emscripten
Emscripten suite for compiling into JS
icc Intel C amd C++ compiler
rocm Radeon Open Compute compilers
oneapi Intel oneAPI compilers
cuda, cuda+gcc, cuda+clang, cuda+msvc
Nvidia CUDA C++ compiler
-comp=COMMAND,COMPILER[,PATH[,VERSION[,ARCHI]]]
Use COMPILER when COMMAND is invoked for compilation
If VERSION and/or ARCHI are not given, nsconfig will
try to determine those. This is useful for cross
compiling and/or setting the CUDA host compiler.
COMMAND must be in { cc, c++, gcc, g++, cl, icc, nvcc,
hipcc, hcc, clang, clang++, armclang, armclang++,
cuda-host-c++, emcc, em++ } ;
VERSION is compiler dependant. Note that VERSION
can be set to only major number(s) in which case
nsconfig fill missing numbers with zeros.
Supported ARCHI:
x86 Intel 32-bits ISA
x86_64 Intel/AMD 64-bits ISA
armel ARMv5 and ARMv6 32-bits ISA
armhf ARMv7 32-bits ISA
aarch64 ARM 64-bits ISA
ppc64el PowerPC 64-bits little entian
wasm32 WebAssembly with 32-bits memory indexing
wasm64 WebAssembly with 64-bits memory indexing
Supported COMPILER:
gcc, g++ GNU Compiler Collection
clang, clang++ LLVM Compiler Infrastructure
emcc, em++ Emscripten compilers
msvc, cl Microsoft Visual C++
armclang, armclang++ ARM Compiler
xlc, xlc++ IBM Compiler
icc Intel C/C++ Compiler
dpcpp Intel DPC++ Compiler
nvcc Nvidia CUDA compiler
hipcc ROCm HIP compiler
fcc_trad_mode, FCC_trad_mode
Fujitsu C and C++ traditionnal
compiler
fcc_clang_mode, FCC_clang_mode
Fujitsu C and C++ traditionnal
compiler
-prefix=PREFIX Set path for installation to PREFIX
-h, --help Print the current help
NOTE: Nvidia CUDA compiler (nvcc) needs a host compiler. Usually on
Linux systems it is GCC while on Windows systems it is MSVC.
If nvcc is chosen as the default C++ compiler via the -suite
switch, then its host compiler can be invoked in compilation
commands with 'cuda-host-c++'. The latter defaults to GCC on Linux
systems and MSVC on Windows systems. The user can of course choose
a specific version and path of this host compiler via the
'-comp=cuda-host-c++,... parameters. If nvcc is not chosen as the
default C++ compiler but is used for compilation then its default
C++ host compiler is 'c++'. The latter can also be customized via
the '-comp=c++,...' command line switch.
```
Each project can define its own set of variables controlling the generation of
the ninja file or Makefile.
```bash
$ ../nstools/bin/nsconfig .. -list-vars
Project variables list:
name | description
-----------------|-----------------------------------
simd | SIMD extension to use
cuda_arch_flags | CUDA target arch flag(s) for tests
static_libstdcpp | Compile the libstdc++ statically
cpp20_tests | Enable C++20 tests
```
Finally one can choose what to do and compile NSIMD and its tests.
```bash
$ ../nstools/bin/nsconfig .. -Dsimd=avx2
$ ninja
$ ninja tests
```
Nsconfig comes with nstest a small tool to execute tests.
```bash
$ ../nstools/bin/nstest -j20
```
## Cross compilation
It is useful to cross-compile for example when you are on an Intel workstation
and want to compile for a Raspberry Pi. Nsconfig generates some code, compiles
and runs it to obtain information about the C or C++ compilers. When cross
compiling, unless you configured your Linux box with binfmt\_misc to
transparently execute aarch64 binaries on a x86\_64 host, you need to give
nsconfig all the information about the compilers so that it does not need to
run aarch64 code on a x86\_64 host.
```bash
$ ../nstools/bin/nsconfig .. -Dsimd=aarch64 \
-comp=cc,gcc,aarch64-linux-gnu-gcc,10.0,aarch64 \
-comp=c++,gcc,aarch64-linux-gnu-g++,10.0,aarch64
```
## Defines that control NSIMD compilation and usage
Several defines control NSIMD.
- `FMA` or `NSIMD_FMA` indicate to NSIMD that fma intrinsics can be used
when compiling code. This is useful on Intel SSE2, SSE42, AVX and AVX2.
- `FP16` or `NSIMD_FP16` indicate to NSIMD that the targeted architecture
natively (and possibly partially) supports IEEE float16's. This is useful
when compiling for Intel SSE2, SSE42, AVX and AVX2, Arm NEON128 and AARCH64.
# Philosophy of NSIMD
Originally the library aimed at providing a portable zero-cost abstraction over
SIMD vendor intrinsics disregarding the underlying SIMD vector length. NSIMD
will of course continue to wrap SIMD intrinsics from various vendors but
more efforts will be put into writing NSIMD modules and improving the existing
ones especially the SPMD module.
## The SPMD paradigm
It is our belief that SPMD is a good paradigm for writing vectorized code. It
helps both the developer and the compiler writer. It forces developers to
better arrange their data in memory in a way more suited for vectorization. On
the compiler side it is simpler to write a "SPMD compiler" than a standard
C/C++/Fortran compiler that tries to autovectorize some weird loop with data
scattered all around the place. Our priorities for our SPMD module are the
following:
- Add oneAPI/SYCL support.
- Provide a richer API.
- Provide cross-lane data transfer.
- Provide a way to abstract shared memory.
Our approach can be roughly compared to ISPC (<https://ispc.github.io/>)
but from a library point of view.
## Wrapping intrinsics in NSIMD core
NSIMD was designed following as closely as possible the following guidelines:
- Correctness primes over speed except for corner cases which may include the
following:
+ Buggy intrinsics on rare input values (denormal numbers, infinities,
NaNs) in which case a slower but correct alternative may be
proposed to bypass the buggy intrinsics.
+ A buggy intrinsics but for a specific version of a family of chips. It
would be unreasonable to penalize the majority of users vs. a few (or
even no) users.
- Emulate with tricks and intrinsic integer arithmetic when not available.
- Use common names as found in common computation libraries.
- Do not hide SIMD registers, one variable (of a type such as `nsimd::pack`)
matches one register. When possible force the user to think differently about
SIMD code versus scalar code.
- Make the life of the compiler as easy as possible: keep the code simple to
allow the compiler to perform as many optimizations as possible.
- Favor the advanced C++ API.
You may wrap intrinsics that require compile time knowledge of the underlying
vector length but this should be done with caution.
Wrapping intrinsics that do not exist for all types is difficult and may
require casting or emulation. For instance, 8 bit integer vector multiplication
using SSE2 does not exist. We can either process each pair of integers
individually or we can cast the 8 bit vectors to 16 bit vectors, do the
multiplication and cast them back to 8 bit vectors. In the second case,
chaining operations will generate many unwanted casts.
To avoid hiding important details to the user, overloads of operators involving
scalars and SIMD vectors are not provided by default. Those can be included
explicitly to emphasize the fact that using expressions like `scalar + vector`
might incur an optimization penalty.
The use of `nsimd::pack` may not be portable to ARM SVE and therefore must be
included manually. ARM SVE registers can only be stored in sizeless structs
(`__sizeless_struct`). This feature (as of 2019/04/05) is only supported by the
ARM compiler. We do not know whether other compilers will use the same keyword
or paradigm to support SVE intrinsics.
# Contributing to NSIMD
The wrapping of intrinsics, the writing of test and bench files are tedious and
repetitive tasks. Most of those are generated using Python scripts that can be
found in `egg`.
- Intrinsics that do not require to known the vector length can be wrapped and
will be accepted with no problem.
- Intrinsics that do require the vector length at compile time can be wrapped
but it is up to the maintainer to accept it.
- Use `clang-format` when writing C or C++ code.
- The `.cpp` files are written in C++98.
- The headers files must be compatible with C89 (when possible otherwise
C99), C++98, C++11, C++14 up to and including C++20.
Please see <doc/markdown/CONTRIBUTE.md> for more details.
# LICENSES
NSIMD contains files from the excellent [Sleef library](https://sleef.org/)
whose license is stated below. The corresponding files are all located
in the `src` folder and have retained their original license notices.
## NSIMD license
Copyright (c) 2021 Agenium Scale
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
## Sleef license ([Boost Software License v1.0](https://www.boost.org/LICENSE_1_0.txt))
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
================================================
FILE: benches/benches.hpp
================================================
#ifndef BENCHES_HPP
#define BENCHES_HPP
#include <climits>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <type_traits>
namespace nsimd {
namespace benches {
template <typename T>
double rand_sign() {
  // Sign multiplier for randomly generated values of type T: unsigned
  // types can only be non-negative so the sign is always +1; signed types
  // draw +1 or -1 with equal probability from ::rand().
  return std::is_unsigned<T>::value ? 1. : ((::rand() & 1) ? 1. : -1.);
}
template <typename T>
T rand_bits(T min, T max = std::numeric_limits<T>::max()) {
  // Draws a value of type T whose bit pattern is uniformly random and
  // rejects it until it falls in [min, max].
  // NOTE(review): the caller must ensure [min, max] is reachable through
  // random bit patterns, otherwise this rejection loop never terminates;
  // for floating T a NaN pattern passes the test (NaN comparisons are
  // false) — confirm that is acceptable for callers.
  T r;
  do {
    int nbits = sizeof(T) * CHAR_BIT;
    u64 x = 0;
    for (int i = 0; i < nbits; ++i) {
      x |= u64(::rand() % 2) << i;
    }
    // std::memcpy instead of the former *((T *)&x): type punning through
    // a pointer cast violates strict aliasing and is undefined behavior;
    // memcpy of sizeof(T) bytes is the portable, well-defined equivalent
    // and compiles to the same code.
    std::memcpy(&r, &x, sizeof(T));
  } while (r < min || r > max);
  return r;
}
template <typename T>
T rand_from(T min, T max = std::numeric_limits<T>::max()) {
  // Uniform draw in [min, max], computed in double precision.
  // Method from: http://c-faq.com/lib/randrange.html
  double lo = double(min);
  double span = double(max) - double(min) + 1;
  double scaled = double(::rand()) / (double(RAND_MAX) / span);
  return T(lo + scaled);
}
template <typename T>
T rand_fp(T min, T max) {
  // Random floating point value in [min, max]. When both bounds are
  // infinite the range is clamped to [-1000000, 1000000] so the draw
  // stays within a finite, usable interval.
  // (rand_bits<T>(min, max) is intentionally not used here for now.)
  const bool unbounded = std::isinf(min) && std::isinf(max);
  if (unbounded) {
    return rand_from<T>(T(-1000000), T(1000000));
  }
  return rand_from<T>(min, max);
}
template <typename T>
T rand(T min, T max = std::numeric_limits<T>::max()) {
  // Generic entry point: integer types draw directly from [min, max].
  return rand_from<T>(min, max);
}

// Explicit specializations route floating point types through rand_fp so
// that infinite bounds are handled. They are marked inline because, unlike
// templates, full specializations defined in a header are subject to the
// one-definition rule and would otherwise produce duplicate-symbol link
// errors when this header is included in several translation units.
template <>
inline float rand<float>(float min, float max) {
  return rand_fp<float>(min, max);
}
template <>
inline double rand<double>(double min, double max) {
  return rand_fp<double>(min, max);
}
}
}
#endif
================================================
FILE: build.nsconfig
================================================
# MIT License
#
# Copyright (c) 2021 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
package_name nsimd-3.0
## ----------------------------------------------------------------------------
## Get OS/Compiler specific file extensions
set o = @obj_ext
set exe = @exe_ext
set s = @asm_ext
set so = @shared_lib_ext
set lib = @shared_link_ext
set root = @source_dir
set make = @make_command
set build = @build_dir
set root = @source_dir
set ccomp = @ccomp_name
set cppcomp = @cppcomp_name
## ----------------------------------------------------------------------------
## Some defaults
ifnot_set "SIMD extension to use" simd = cpu
ifnot_set "CUDA target arch flag(s) for tests" cuda_arch_flags = ""
ifnot_set "Compile the libstdc++ statically" static_libstdcpp = true
ifnot_set "Enable C++20 tests" cpp20_tests = ""
## ----------------------------------------------------------------------------
## Targets for compilation
set o_for_ = fp16$o memory$o ufp$o api_cpu$o rempitab$o \
sleefsp$o sleefdp$o gpu$o
set o_for_cpu = $o_for_
set o_for_cuda = $o_for_
set o_for_rocm = $o_for_
set o_for_oneapi = $o_for_
set o_for_sse2 = $o_for_cpu api_sse2$o sleef_sse2_f32$o \
sleef_sse2_f64$o
set o_for_sse42 = $o_for_sse2 api_sse42$o sleef_sse42_f32$o \
sleef_sse42_f64$o
set o_for_avx = $o_for_sse42 api_avx$o sleef_avx_f32$o \
sleef_avx_f64$o
set o_for_avx2 = $o_for_avx api_avx2$o sleef_avx2_f32$o \
sleef_avx2_f64$o
set o_for_avx512_knl = $o_for_avx2 api_avx512_knl$o \
sleef_avx512_knl_f32$o sleef_avx512_knl_f64$o
set o_for_avx512_skylake = $o_for_avx2 api_avx512_skylake$o \
sleef_avx512_skylake_f32$o \
sleef_avx512_skylake_f64$o
set o_for_neon128 = $o_for_cpu api_neon128$o sleef_neon128_f32$o \
sleef_neon128_f64$o
set o_for_aarch64 = $o_for_cpu api_aarch64$o sleef_aarch64_f32$o \
sleef_aarch64_f64$o
set o_for_sve = $o_for_aarch64 api_sve$o sleef_sve_f32$o \
sleef_sve_f64$o
set o_for_sve128 = $o_for_aarch64 api_sve128$o sleef_sve128_f32$o \
sleef_sve128_f64$o
set o_for_sve256 = $o_for_aarch64 api_sve256$o sleef_sve256_f32$o \
sleef_sve256_f64$o
set o_for_sve512 = $o_for_aarch64 api_sve512$o sleef_sve512_f32$o \
sleef_sve512_f64$o
set o_for_sve1024 = $o_for_aarch64 api_sve1024$o sleef_sve1024_f32$o \
sleef_sve1024_f64$o
set o_for_sve2048 = $o_for_aarch64 api_sve2048$o sleef_sve2048_f32$o \
sleef_sve2048_f64$o
set o_for_vmx = $o_for_cpu api_vmx$o sleef_vmx_f32$o sleef_vmx_f64$o
set o_for_vsx = $o_for_vmx api_vsx$o sleef_vsx_f32$o sleef_vsx_f64$o
## ----------------------------------------------------------------------------
## SIMD compiler flags
lambda cflags_for_generic_* = -DCPU
set cflags_for_generic_cuda = -DCUDA
set cflags_for_generic_rocm = -DROCM
set cflags_for_generic_oneapi = -DONEAPI
set cflags_for_ = ${cflags_for_generic_$simd$}
set cflags_for_cpu = $cflags_for_
set cflags_for_cuda = -DCUDA
set cflags_for_rocm = -DROCM
set cflags_for_oneapi = -DONEAPI
set cflags_for_sse2 = -DSSE2 -msse2
set cflags_for_sse42 = -DSSE42 -msse42
set cflags_for_avx = -DAVX -mavx
set cflags_for_avx2 = -DAVX2 -mavx2 -DFMA -mfma -DFP16 -mfp16
set cflags_for_avx512_knl = -DAVX512_KNL -mavx512_knl -mfma -DFP16 -mfp16
set cflags_for_avx512_skylake = -DAVX512_SKYLAKE -mavx512_skylake -mfma \
-DFP16 -mfp16
set cflags_for_neon128 = -DNEON128 -mneon128
set cflags_for_aarch64 = -DAARCH64 -maarch64
set cflags_for_sve = -DSVE -msve
set cflags_for_sve128 = -DSVE128 -msve128
set cflags_for_sve256 = -DSVE256 -msve256
set cflags_for_sve512 = -DSVE512 -msve512
set cflags_for_sve1024 = -DSVE1024 -msve1024
set cflags_for_sve2048 = -DSVE2048 -msve2048
set cflags_for_vmx = -DVMX -mvmx
set cflags_for_vsx = -DVSX -mvsx
## ----------------------------------------------------------------------------
## std default flag
lambda std_flag_for_* = -std=c++98
set std_flag_for_rocm = -std=c++11
set std_flag_for_oneapi = -std=c++17
## ----------------------------------------------------------------------------
## libstdc++ linking mode
set libstdcpp_static_link_true = -static-libstdc++
set libstdcpp_static_link_false =
## ----------------------------------------------------------------------------
## Some defaults
set flags = -Wall -fPIC -O2 -I$root$/include -DNDEBUG
set cflags = ${std_flag_for_$simd$} $flags \
${libstdcpp_static_link_$static_libstdcpp$}
set sleef_cflags = -fPIC -O2 -I$root$/src -DNDEBUG -DDORENAME=1
## ----------------------------------------------------------------------------
## Default building rules
phony all deps libnsimd_$simd$$so$
build_file libnsimd_$simd$$so deps ${o_for_$simd$}
c++ -fPIC -shared @in -o @out
set ldflags = -fPIC -L. -lnsimd_$simd
## ----------------------------------------------------------------------------
## Generic (emulation) rules for building
build_file gpu$o autodeps $root$/src/gpu.cpp
c++ $cflags$ $cflags_for_cpu @in -c -o @out
build_file ufp$o autodeps $root$/src/ufp.cpp
c++ $cflags$ $cflags_for_cpu @in -c -o @out
build_file fp16$o autodeps $root$/src/fp16.cpp
c++ $cflags$ $cflags_for_cpu @in -c -o @out
build_file memory$o autodeps $root$/src/memory.cpp
c++ $cflags$ $cflags_for_cpu @in -c -o @out
build_file rempitab$o autodeps $root$/src/rempitab.c
cc $sleef_cflags$ -c @in -o @out
build_file sleefsp$o autodeps $root$/src/sleefsp.c
cc $sleef_cflags$ -c @in -o @out
build_file sleefdp$o autodeps $root$/src/sleefdp.c
cc $sleef_cflags$ -c @in -o @out
build_file api_cpu$o autodeps $root$/src/api_cpu.cpp
c++ $cflags$ $cflags_for_cpu -c @in -o @out
## ----------------------------------------------------------------------------
## Intel rules for building
build_file api_sse2$o autodeps $root$/src/api_sse2.cpp
c++ $cflags$ -c $cflags_for_sse2 @in -o @out
build_file sleef_sse2_f32$o autodeps $root$/src/sleefsimdsp.c
cc $sleef_cflags$ -c -msse2 -DNSIMD_SSE2 -DENABLE_SSE2=1 @in -o @out
build_file sleef_sse2_f64$o autodeps $root$/src/sleefsimddp.c
cc $sleef_cflags$ -c -msse2 -DNSIMD_SSE2 -DENABLE_SSE2=1 @in -o @out
build_file api_sse42$o autodeps $root$/src/api_sse42.cpp
c++ $cflags$ -c $cflags_for_sse42 @in -o @out
build_file sleef_sse42_f32$o autodeps $root$/src/sleefsimdsp.c
cc $sleef_cflags$ -c -msse42 -DNSIMD_SSE42 -DENABLE_SSE4=1 @in -o @out
build_file sleef_sse42_f64$o autodeps $root$/src/sleefsimddp.c
cc $sleef_cflags$ -c -msse42 -DNSIMD_SSE42 -DENABLE_SSE4=1 @in -o @out
build_file api_avx$o autodeps $root$/src/api_avx.cpp
c++ $cflags$ -c $cflags_for_avx @in -o @out
build_file sleef_avx_f32$o autodeps $root$/src/sleefsimdsp.c
cc $sleef_cflags$ -c -mavx -DNSIMD_AVX -DENABLE_AVX=1 @in -o @out
build_file sleef_avx_f64$o autodeps $root$/src/sleefsimddp.c
cc $sleef_cflags$ -c -mavx -DNSIMD_AVX -DENABLE_AVX=1 @in -o @out
build_file api_avx2$o autodeps $root$/src/api_avx2.cpp
c++ $cflags$ -c $cflags_for_avx2 @in -o @out
build_file sleef_avx2_f32$o autodeps $root$/src/sleefsimdsp.c
cc $sleef_cflags$ -c -mavx2 -mfma -DNSIMD_AVX2 -DENABLE_AVX2=1 \
@in -o @out
build_file sleef_avx2_f64$o autodeps $root$/src/sleefsimddp.c
cc $sleef_cflags$ -c -mavx2 -mfma -DNSIMD_AVX2 -DENABLE_AVX2=1 \
@in -o @out
build_file api_avx512_knl$o autodeps $root$/src/api_avx512_knl.cpp
c++ $cflags$ -c $cflags_for_avx512_knl @in -o @out
build_file sleef_avx512_knl_f32$o autodeps $root$/src/sleefsimdsp.c
cc $sleef_cflags$ -c -mavx512_knl -DNSIMD_AVX512_KNL \
-DENABLE_AVX512F=1 @in -o @out
build_file sleef_avx512_knl_f64$o autodeps $root$/src/sleefsimddp.c
cc $sleef_cflags$ -c -mavx512_knl -DNSIMD_AVX512_KNL \
-DENABLE_AVX512F=1 @in -o @out
build_file api_avx512_skylake$o autodeps $root$/src/api_avx512_skylake.cpp
c++ $cflags$ -c $cflags_for_avx512_skylake @in -o @out
# NOTE(review): these two rules previously passed -mavx512_knl, apparently a
# copy-paste from the KNL rules above; they now use -mavx512_skylake to match
# -DNSIMD_AVX512_SKYLAKE and cflags_for_avx512_skylake.
build_file sleef_avx512_skylake_f32$o autodeps $root$/src/sleefsimdsp.c
	cc $sleef_cflags$ -c -mavx512_skylake -DNSIMD_AVX512_SKYLAKE \
	   -DENABLE_AVX512F=1 @in -o @out
build_file sleef_avx512_skylake_f64$o autodeps $root$/src/sleefsimddp.c
	cc $sleef_cflags$ -c -mavx512_skylake -DNSIMD_AVX512_SKYLAKE \
	   -DENABLE_AVX512F=1 @in -o @out
## ----------------------------------------------------------------------------
## ARM 32 bits rules for building
build_file api_neon128$o autodeps $root$/src/api_neon128.cpp
c++ $cflags$ -c $cflags_for_neon128 @in -o @out
build_file sleef_neon128_f32$o autodeps $root$/src/sleefsimdsp.c
cc $sleef_cflags$ -c -mneon128 -DNSIMD_NEON128 \
-DENABLE_NEON32=1 @in -o @out
build_file sleef_neon128_f64$o autodeps $root$/src/sleefsimddp_emulation.c
cc $sleef_cflags$ -c -mneon128 -DNSIMD_NEON128 -DENABLE_NEON32=1 \
-I$root$/include @in -o @out
## ----------------------------------------------------------------------------
## ARM 64 bits rules for building
build_file api_aarch64$o autodeps $root$/src/api_aarch64.cpp
c++ $cflags$ -c $cflags_for_aarch64 @in -o @out
build_file sleef_aarch64_f32$o autodeps $root$/src/sleefsimdsp.c
cc $sleef_cflags$ -c -maarch64 -DNSIMD_AARCH64 \
-DENABLE_ADVSIMD=1 @in -o @out
build_file sleef_aarch64_f64$o autodeps $root$/src/sleefsimddp.c
cc $sleef_cflags$ -c -maarch64 -DNSIMD_AARCH64 \
-DENABLE_ADVSIMD=1 @in -o @out
build_file api_sve$o autodeps $root$/src/api_sve.cpp
c++ $cflags$ -c $cflags_for_sve @in -o @out
build_file sleef_sve_f32$o autodeps $root$/src/sleefsimdsp.c
cc $sleef_cflags$ -c -msve -DNSIMD_SVE -DENABLE_SVE=1 @in -o @out
build_file sleef_sve_f64$o autodeps $root$/src/sleefsimddp.c
cc $sleef_cflags$ -c -msve -DNSIMD_SVE -DENABLE_SVE=1 @in -o @out
build_file api_sve128$o autodeps $root$/src/api_sve128.cpp
c++ $cflags$ -c $cflags_for_sve128 @in -o @out
build_file sleef_sve128_f32$o autodeps $root$/src/sleefsimdsp.c
cc $sleef_cflags$ -c -msve128 -DNSIMD_SVE128 -DENABLE_SVE=1 @in -o @out
build_file sleef_sve128_f64$o autodeps $root$/src/sleefsimddp.c
cc $sleef_cflags$ -c -msve128 -DNSIMD_SVE128 -DENABLE_SVE=1 @in -o @out
build_file api_sve256$o autodeps $root$/src/api_sve256.cpp
c++ $cflags$ -c $cflags_for_sve256 @in -o @out
build_file sleef_sve256_f32$o autodeps $root$/src/sleefsimdsp.c
cc $sleef_cflags$ -c -msve256 -DNSIMD_SVE256 -DENABLE_SVE=1 @in -o @out
build_file sleef_sve256_f64$o autodeps $root$/src/sleefsimddp.c
cc $sleef_cflags$ -c -msve256 -DNSIMD_SVE256 -DENABLE_SVE=1 @in -o @out
build_file api_sve512$o autodeps $root$/src/api_sve512.cpp
c++ $cflags$ -c $cflags_for_sve512 @in -o @out
build_file sleef_sve512_f32$o autodeps $root$/src/sleefsimdsp.c
cc $sleef_cflags$ -c -msve512 -DNSIMD_SVE512 -DENABLE_SVE=1 @in -o @out
build_file sleef_sve512_f64$o autodeps $root$/src/sleefsimddp.c
cc $sleef_cflags$ -c -msve512 -DNSIMD_SVE512 -DENABLE_SVE=1 @in -o @out
build_file api_sve1024$o autodeps $root$/src/api_sve1024.cpp
c++ $cflags$ -c $cflags_for_sve1024 @in -o @out
build_file sleef_sve1024_f32$o autodeps $root$/src/sleefsimdsp.c
cc $sleef_cflags$ -c -msve1024 -DNSIMD_SVE1024 -DENABLE_SVE=1 \
@in -o @out
build_file sleef_sve1024_f64$o autodeps $root$/src/sleefsimddp.c
cc $sleef_cflags$ -c -msve1024 -DNSIMD_SVE1024 -DENABLE_SVE=1 \
@in -o @out
build_file api_sve2048$o autodeps $root$/src/api_sve2048.cpp
c++ $cflags$ -c $cflags_for_sve2048 @in -o @out
build_file sleef_sve2048_f32$o autodeps $root$/src/sleefsimdsp.c
cc $sleef_cflags$ -c -msve2048 -DNSIMD_SVE2048 -DENABLE_SVE=1 \
@in -o @out
build_file sleef_sve2048_f64$o autodeps $root$/src/sleefsimddp.c
cc $sleef_cflags$ -c -msve2048 -DNSIMD_SVE2048 -DENABLE_SVE=1 \
@in -o @out
## ----------------------------------------------------------------------------
## POWERPC rules for building
build_file api_vmx$o autodeps $root$/src/api_vmx.cpp
c++ $cflags$ -c $cflags_for_vmx @in -o @out
build_file sleef_vmx_f32$o autodeps $root$/src/sleefsimdsp_emulation.c
cc $sleef_cflags$ -c -mvmx -DNSIMD_VMX -DENABLE_VSX=1 \
-I$root$/include @in -o @out
build_file sleef_vmx_f64$o autodeps $root$/src/sleefsimddp_emulation.c
cc $sleef_cflags$ -c -mvmx -DNSIMD_VMX -DENABLE_VSX=1 \
-I$root$/include @in -o @out
build_file api_vsx$o autodeps $root$/src/api_vsx.cpp
c++ $cflags$ -c $cflags_for_vsx @in -o @out
build_file sleef_vsx_f32$o autodeps $root$/src/sleefsimdsp.c
cc $sleef_cflags$ -c -mvsx -DNSIMD_VSX -DENABLE_VSX=1 @in -o @out
build_file sleef_vsx_f64$o autodeps $root$/src/sleefsimddp.c
cc $sleef_cflags$ -c -mvsx -DNSIMD_VSX -DENABLE_VSX=1 @in -o @out
## ----------------------------------------------------------------------------
## Installation and packaging
install_file libnsimd_${simd}$so lib
[W] install_file libnsimd_${simd}$lib lib
install_dir $root$/include/nsimd include
install_dir $root$/doc/html doc
## ----------------------------------------------------------------------------
## Tests
# Lambda arguments: suite, compiler, std, simd_ext
# By default all tests will be considered
lambda tests_*_*_* = ok
# Now disable some possibilities on certain compilers
set tests_clang_c89_vmx = ""
set tests_clang_c89_vsx = ""
set tests_clang_c89_sve = ""
lambda tests_*_c89_cuda = ""
lambda tests_*_c99_cuda = ""
lambda tests_*_c11_cuda = ""
lambda tests_*_cpp17_cuda = ""
lambda tests_*_c89_rocm = ""
lambda tests_*_c99_rocm = ""
lambda tests_*_c11_rocm = ""
lambda tests_*_cpp98_rocm = ""
lambda tests_*_cpp17_rocm = ""
lambda tests_*_c89_oneapi = ""
lambda tests_*_c99_oneapi = ""
lambda tests_*_c11_oneapi = ""
lambda tests_dpcpp_cpp98_* = ""
lambda tests_dpcpp_cpp11_* = ""
set c89_enabled = ${tests_$ccomp$_c89_$simd$}
set c89.files = ""
set c99_enabled = ${tests_$ccomp$_c99_$simd$}
set c99.files = ""
set c11_enabled = ${tests_$ccomp$_c11_$simd$}
set c11.files = ""
set cpp98_enabled = ${tests_$cppcomp$_cpp98_$simd$}
set cpp98.files = ""
set cpp11_enabled = ${tests_$cppcomp$_cpp11_$simd$}
set cpp11.files = ""
set cpp17_enabled = ${tests_$cppcomp$_cpp17_$simd$}
set cpp17.files = ""
set cpp20.files = ""
set tests_flags = $cuda_arch_flags $flags ${cflags_for_$simd$} -lm $ldflags
echo Test compilation flags: $tests_flags$
[$c89_enabled$] build_files c89 foreach glob:$root$/tests/*.prec11.c \
as tests.%r.c89$exe \
autodeps @item libnsimd_$simd$$so$
[$c89_enabled$] cc -std=c89 @item $tests_flags -o @out
[$c89_enabled$] phony tests.c89 deps $c89.files
[$c99_enabled$] build_files c99 foreach glob:$root$/tests/*.prec11.c \
as tests.%r.c99$exe \
autodeps @item libnsimd_$simd$$so$
[$c99_enabled$] cc -std=c99 @item $tests_flags -o @out
[$c99_enabled$] phony tests.c99 deps $c99.files
[$c11_enabled$] build_files c11 foreach glob:$root$/tests/*.c \
as tests.%r.c11$exe \
autodeps @item libnsimd_$simd$$so$
[$c11_enabled$] cc -std=c11 @item $tests_flags -o @out
[$c11_enabled$] phony tests.c11 deps $c11.files
[$cpp98_enabled$] build_files cpp98 foreach glob:$root$/tests/*.cpp \
as tests.%r.cpp98$exe \
autodeps @item libnsimd_$simd$$so$
[$cpp98_enabled$] c++ -std=c++98 @item $tests_flags -o @out
[$cpp98_enabled$] phony tests.cpp98 deps $cpp98.files
[$cpp11_enabled$] build_files cpp11 foreach glob:$root$/tests/*.cpp \
as tests.%r.cpp11$exe \
autodeps @item libnsimd_$simd$$so$
[$cpp11_enabled$] c++ -std=c++11 @item $tests_flags -o @out
[$cpp11_enabled$] phony tests.cpp11 deps $cpp11.files
[$cpp17_enabled$] build_files cpp17 foreach glob:$root$/tests/*.cpp \
as tests.%r.cpp17$exe \
autodeps @item libnsimd_$simd$$so$
[$cpp17_enabled$] c++ -std=c++17 @item $tests_flags -o @out
[$cpp17_enabled$] phony tests.cpp17 deps $cpp17.files
[$cpp20_tests$] build_files cpp20 foreach glob:$root$/tests/*.cpp \
as tests.%r.cpp20$exe \
autodeps @item libnsimd_$simd$$so$
[$cpp20_tests$] c++ -std=c++20 @item $tests_flags -o @out
[$cpp20_tests$] phony tests.cpp20 deps $cpp20.files
# Phony target for tests
phony tests deps $c89.files $c99.files $c11.files $cpp98.files $cpp11.files \
$cpp17.files $cpp20.files
## ----------------------------------------------------------------------------
## Examples
build_files examples_cpp98 foreach glob:$root$/examples/*.cpp \
as examples.%r.cpp98$exe \
autodeps @item libnsimd_$simd$$so$
c++ -std=c++98 @item $tests_flags -o @out
phony examples.cpp98 deps $examples_cpp98.files
================================================
FILE: doc/Makefile.nix
================================================
# Copyright (c) 2020 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Tools and flags for building the documentation generators on Unix-like
# systems. Both tools link against libns2.a built from the nstools/ns2
# sources checked out next to this repository.
NS2_ROOT = ../nstools/ns2
CXX = c++
CXX_FLAGS = -O2 -Wall -Wextra -pedantic -std=c++11
# Default target: build both documentation tools.
all: md2html what_is_wrapped
# Static support library built from the ns2 sources; rebuilt whenever the
# nstools git checkout changes (tracked via its .git/logs/HEAD file). The
# sources are copied into a scratch directory so object files never
# pollute the ns2 tree, and the directory is removed afterwards.
libns2.a: $(NS2_ROOT)/../.git/logs/HEAD Makefile.nix
	rm -rf libns2
	mkdir -p libns2
	cp $(NS2_ROOT)/lib/*.cpp libns2
	(cd libns2 && $(CXX) $(CXX_FLAGS) -I../$(NS2_ROOT)/include -c *.cpp)
	ar rcs $@ libns2/*.o
	rm -rf libns2
# Markdown-to-HTML converter used to generate the documentation.
md2html: libns2.a md2html.cpp Makefile.nix
	$(CXX) $(CXX_FLAGS) md2html.cpp -I$(NS2_ROOT)/include -o $@ -L. -lns2
# Tool reporting which intrinsics are wrapped by NSIMD.
what_is_wrapped: libns2.a what_is_wrapped.cpp Makefile.nix
	$(CXX) $(CXX_FLAGS) what_is_wrapped.cpp -I$(NS2_ROOT)/include -o $@ \
	-L. -lns2
================================================
FILE: doc/Makefile.win
================================================
# Copyright (c) 2020 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Tools and flags for building the documentation generators with MSVC
# (nmake). Both tools link against libns2.lib built from the nstools/ns2
# sources checked out next to this repository.
NS2_ROOT = ..\nstools\ns2
CXX = cl
CXX_FLAGS = /nologo /Ox /W3 /EHsc /DNS_NO_DLLSPEC /D_CRT_SECURE_NO_WARNINGS
# Default target: build both documentation tools.
all: md2html.exe what_is_wrapped.exe
# Static support library built from the ns2 sources; rebuilt whenever the
# nstools git checkout changes (tracked via its .git\logs\HEAD file). The
# sources are copied into a scratch directory so object files never
# pollute the ns2 tree, and the directory is removed afterwards.
libns2.lib: $(NS2_ROOT)\..\.git\logs\HEAD Makefile.win
	if exist libns2 rd /Q /S libns2
	md libns2
	copy /Y $(NS2_ROOT)\lib\*.cpp libns2
	(cd libns2 && $(CXX) $(CXX_FLAGS) -I..\$(NS2_ROOT)\include /c *.cpp)
	lib /nologo /out:libns2.lib libns2\*.obj
	rd /Q /S libns2
# Markdown-to-HTML converter used to generate the documentation.
md2html.exe: libns2.lib md2html.cpp Makefile.win
	$(CXX) $(CXX_FLAGS) /I$(NS2_ROOT)\include md2html.cpp libns2.lib \
	Shlwapi.lib Dbghelp.lib /Fe$@
# Tool reporting which intrinsics are wrapped by NSIMD.
what_is_wrapped.exe: libns2.lib what_is_wrapped.cpp Makefile.win
	$(CXX) $(CXX_FLAGS) /I$(NS2_ROOT)\include what_is_wrapped.cpp \
	libns2.lib Shlwapi.lib Dbghelp.lib /Fe$@
================================================
FILE: doc/markdown/compilers_and_versions.md
================================================
<!--
Copyright (c) 2019 Agenium Scale
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-->
`nsimd` is tested with GCC, Clang and MSVC. As a C89 and a C++98 API are
provided, other compilers should work fine. Old compiler versions should work as
long as they support the targeted SIMD extension. For instance, `nsimd` can
compile on MSVC 2010 `SSE4.2` code.
`nsimd` requires a C or a C++ compiler and is actually daily tested on the
following compilers for the following hardware:
**Compiler** | **Version** | **Architecture** | **Extensions**
----------------------- | ----------- | ---------------- | --------------
GCC | 8.3.0 | Intel | `SSE2`, `SSE4.2`, `AVX`, `AVX2`, `AVX-512` (`KNL` and `SKYLAKE`)
Clang | 7.0.1 | Intel | `SSE2`, `SSE4.2`, `AVX`, `AVX2`, `AVX-512` (`KNL` and `SKYLAKE`)
GCC | 8.3.0 | ARM | `Aarch64`, `NEON` (`ARMv7`), `SVE`
Clang | 7.0.1 | ARM | `Aarch64`, `NEON` (`ARMv7`), `SVE`
Microsoft Visual Studio | 2017 | Intel | `SSE4.2`
Intel C++ Compiler | 19.0.4.243 | Intel | `SSE2`, `SSE4.2`, `AVX`, `AVX2`, `AVX-512` (`SKYLAKE`)
<!-- TODO -->
<!--We recommend using a 64-bits compiler as this results in significantly better
performance. Also, `nsimd` performances are only provided when compiled in an
optimized code with assertions disabled.-->
================================================
FILE: doc/markdown/concepts.md
================================================
# C++20 concepts
As of C++20, concepts are available. We quote <en.cppreference.com> to
introduce concepts.
*Class templates, function templates, and non-template functions (typically
members of class templates) may be associated with a constraint, which
specifies the requirements on template arguments, which can be used to select
the most appropriate function overloads and template specializations.*
*Named sets of such requirements are called concepts. Each concept is a
predicate, evaluated at compile time, and becomes a part of the interface of a
template where it is used as a constraint*
## Concepts provided by NSIMD
All concepts provided by NSIMD come in two forms:
- The native C++20 form in the `nsimd` namespace
- As a macro for keeping the compatibility with older versions of C++
The following tables list all concepts and are exhaustive. Native concepts are
accessible through the `nsimd` namespace. They take only one argument. Their
macro counterparts take no argument as they are meant to be used as
constraint placeholder types. When compiling for older C++ versions NSIMD
concepts macros are simply read as `typename` by the compiler.
Table for base C and C++ APIs:
| Native concept | Macro | Description |
|:----------------------------|:-----------------------------------|:-----------------------------------------------|
| `simd_ext_c` | `NSIMD_CONCEPT_SIMD_EXT` | Valid SIMD extension |
| `simd_value_type_c` | `NSIMD_CONCEPT_VALUE_TYPE` | Valid NSIMD underlying value type |
| `simd_value_type_or_bool_c` | `NSIMD_CONCEPT_VALUE_TYPE_OR_BOOL` | Valid NSIMD underlying value type or `bool` |
| `alignment_c` | `NSIMD_CONCEPT_ALIGNMENT` | Valid NSIMD alignment `aligned` or `unaligned` |
Table for advanced C++ API:
| Native concept | Macro | Description |
|:---------------|:-------------------------|:----------------------|
| `is_pack_c` | `NSIMD_CONCEPT_PACK` | Valid NSIMD pack |
| `is_packl_c` | `NSIMD_CONCEPT_PACKL` | Valid NSIMD packl |
| `is_packx1_c` | `NSIMD_CONCEPT_PACKX1` | Valid NSIMD packx1 |
| `is_packx2_c` | `NSIMD_CONCEPT_PACKX2` | Valid NSIMD packx2 |
| `is_packx3_c` | `NSIMD_CONCEPT_PACKX3` | Valid NSIMD packx3 |
| `is_packx4_c` | `NSIMD_CONCEPT_PACKX4` | Valid NSIMD packx4 |
| `any_pack_c` | `NSIMD_CONCEPT_ANY_PACK` | Any of the above pack |
## Expressing C++20 constraints
Expressing constraints can of course be done with the `requires` keyword. But
for compatibility with older C++ versions NSIMD provides `NSIMD_REQUIRES`
which takes the constraints as its only argument.
```c++
template <typename T, typename S>
NSIMD_REQUIRES(sizeof(T) == sizeof(S))
void foo(T, S);
```
It is advised to use doubled parentheses as commas in the constraints expression
can be interpreted as argument separators for the macro itself.
```c++
template <typename T, typename S>
NSIMD_REQUIRES((std::is_same<T, S>))
void foo(T, S);
```
Note that when expressing constraints using `nsimd::sizeof_v`'s prefer the
NSIMD definition of sizeof for the following reason: when dealing with
float16's one cannot know the underlying representation of such a type as it
is non-portable and non-standard, but NSIMD provides helper functions to
transparently deal with float16's as if they were 16-bits wide. Therefore
expressing sizeof equality should be done with `nsimd::sizeof_v`.
```c++
template <typename T, typename S>
NSIMD_REQUIRES((nsimd::sizeof_v<T> == nsimd::sizeof_v<S>))
void foo(T, S);
```
================================================
FILE: doc/markdown/defines.md
================================================
# Defines provided by NSIMD
NSIMD uses macros (not function macros) that we call defines to make choices
in its code at compile time. Most of them can be of use to the end-user so
we list them here.
## Compiler detection
The compiler detection is automatically done by NSIMD as it is relatively
easy.
| Define | Compiler |
|---------------------|---------------------------------------------------|
| `NSIMD_IS_MSVC` | Microsoft Visual C++ |
| `NSIMD_IS_HIPCC` | ROCm HIP compiler (warning, see below) |
| `NSIMD_IS_NVCC` | NVIDIA CUDA Compiler |
| `NSIMD_IS_ICC` | Intel C++ Compiler |
| `NSIMD_IS_CLANG` | Clang/LLVM |
| `NSIMD_IS_GCC` | GNU Compiler Collection |
| `NSIMD_IS_FCC` | Fujitsu compiler |
**Warning**: some HIP versions do not declare themselves at all so it is
impossible to find out that HIP is the compiler. As HIP is based on clang,
without help NSIMD will detect Clang. It is up to the end-user to compile
with `-D__HIPCC__` for NSIMD to detect HIP.
Note that we do support the Armclang C and C++ compilers but for NSIMD there
is no need to have code different from Clang's specific code so we do not
provide a macro to detect this compiler in particular.
Note also that two of the above macros can be defined at the same time. This
happens typically when compiling for a device. For example when compiling for
NVIDIA CUDA with nvcc both `NSIMD_IS_NVCC` and `NSIMD_IS_GCC` (when the host
compiler is GCC).
## Compilation environment and constants
| Define | Description | Possible values |
|-------------------|-----------------------|---------------------------------|
| `NSIMD_C` | C version | 1989, 1999, 2011 |
| `NSIMD_CXX` | C++ version | 1998, 2011, 2014, 2017, 2020 |
| `NSIMD_WORD_SIZE` | Machine word size | 32, 64 |
| `NSIMD_U8_MIN` | Minimum value for u8 | 0 |
| `NSIMD_U8_MAX` | Maximum value for u8 | 255 |
| `NSIMD_I8_MIN` | Minimum value for i8 | -128 |
| `NSIMD_I8_MAX` | Maximum value for i8 | 127 |
| `NSIMD_U16_MIN` | Minimum value for u16 | 0 |
| `NSIMD_U16_MAX` | Maximum value for u16 | 65535 |
| `NSIMD_I16_MIN` | Minimum value for i16 | -32768 |
| `NSIMD_I16_MAX` | Maximum value for i16 | 32767 |
| `NSIMD_U32_MIN` | Minimum value for u32 | 0 |
| `NSIMD_U32_MAX` | Maximum value for u32 | 4294967295 |
| `NSIMD_I32_MIN` | Minimum value for i32 | -2147483648 |
| `NSIMD_I32_MAX` | Maximum value for i32 | 2147483647 |
| `NSIMD_U64_MIN` | Minimum value for u64 | 0 |
| `NSIMD_U64_MAX` | Maximum value for u64 | 18446744073709551615 |
| `NSIMD_I64_MIN` | Minimum value for i64 | -9223372036854775808 |
| `NSIMD_I64_MAX` | Maximum value for i64 | 9223372036854775807 |
| `NSIMD_DLLSPEC` | (Windows) DLL storage-class information | `__declspec(dllexport)` or `__declspec(dllimport)` |
| `NSIMD_DLLSPEC` | (Unix) storage-class information | `extern` or nothing |
| `NSIMD_C_LINKAGE_FOR_F16` | Indicate whether functions involving f16 have C linkage | defined or not |
## Targeted architecture detection
Contrary to the compiler detection, the targeted architecture detection is not
done automatically by NSIMD as it is really hard and some compilers do not
provide the necessary information. So in order to have a consistent way of
targeting an architecture it is up to the end-user to specify it using one of
the
following defines.
| Define | Targeted architecture |
|------------------------|---------------------------------------------------|
| `NSIMD_CPU` | Generic, no SIMD, emulation |
| `NSIMD_SSE2` | Intel SSE2 |
| `NSIMD_SSE42` | Intel SSE4.2 |
| `NSIMD_AVX` | Intel AVX |
| `NSIMD_AVX2` | Intel AVX2 |
| `NSIMD_AVX512_KNL` | Intel AVX-512 as found on KNLs |
| `NSIMD_AVX512_SKYLAKE` | Intel AVX-512 as found on Xeon Skylake |
| `NSIMD_NEON128` | Arm NEON 128 bits as found on 32-bits Arm chips |
| `NSIMD_AARCH64` | Arm NEON 128 bits as found on 64-bits Arm chips |
| `NSIMD_SVE` | Arm SVE (length agnostic) |
| `NSIMD_SVE128` | Arm SVE (size known at compilation to 128 bits) |
| `NSIMD_SVE256` | Arm SVE (size known at compilation to 256 bits) |
| `NSIMD_SVE512` | Arm SVE (size known at compilation to 512 bits) |
| `NSIMD_SVE1024` | Arm SVE (size known at compilation to 1024 bits) |
| `NSIMD_SVE2048` | Arm SVE (size known at compilation to 2048 bits) |
| `NSIMD_CUDA` | Nvidia CUDA |
| `NSIMD_ROCM` | AMD ROCm architectures |
| `NSIMD_VMX` | IBM POWERPC VMX (Altivec) |
| `NSIMD_VSX` | IBM POWERPC VSX (Altivec) |
| `NSIMD_FP16` | Architecture supports natively IEEE float16 |
| `NSIMD_FMA` | Architecture supports natively FMAs |
## Targeted architecture constants
| Define | Description |
|-----------------------|----------------------------------------------------|
| `NSIMD_NB_REGISTERS` | Number of SIMD registers |
| `NSIMD_MAX_LEN_BIT` | Maximum number of bits in a SIMD register |
| `NSIMD_MAX_LEN_i8` | Maximum number of i8's in a SIMD register |
| `NSIMD_MAX_LEN_u8` | Maximum number of u8's in a SIMD register |
| `NSIMD_MAX_LEN_i16` | Maximum number of i16's in a SIMD register |
| `NSIMD_MAX_LEN_u16` | Maximum number of u16's in a SIMD register |
| `NSIMD_MAX_LEN_i32` | Maximum number of i32's in a SIMD register |
| `NSIMD_MAX_LEN_u32` | Maximum number of u32's in a SIMD register |
| `NSIMD_MAX_LEN_i64` | Maximum number of i64's in a SIMD register |
| `NSIMD_MAX_LEN_u64` | Maximum number of u64's in a SIMD register |
NSIMD provides a mean to write generic code by using the `NSIMD_MAX_LEN` macros
whose argument is one of { i8, u8, i16, u16, i32, u32, i64, u64 }.
```c++
#define T ??? // to be defined as a base type
int main(void) {
T buf[NSIMD_MAX_LEN(T)]; // an array of T's for loading/storing
...
return 0;
}
```
## Other useful macros
NSIMD provides macros to concatenate blobs so that generic programming in pure
C is possible.
- `#define NSIMD_PP_CAT_2(a, b)` concatenates `a` and `b`.
- `#define NSIMD_PP_CAT_3(a, b, c)` concatenates `a`, `b` and `c`.
- `#define NSIMD_PP_CAT_4(a, b, c, d)` concatenates `a`, `b`, `c` and `d`.
- `#define NSIMD_PP_CAT_5(a, b, c, d, e)` concatenates `a`, `b`, `c`, `d` and
`e`.
- `#define NSIMD_PP_CAT_6(a, b, c, d, e, f)` concatenates `a`, `b`, `c`, `d`,
`e` and `f`.
================================================
FILE: doc/markdown/faq.md
================================================
<!--
Copyright (c) 2020 Agenium Scale
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-->
# Frequently Asked Questions
## Is it good practice to use a `nsimd::pack` as a `std::vector`?
No, these are two very different objects. A `nsimd::pack` represent a SIMD
register whereas a `std::vector` represents a chunk of memory. You should
separate concerns and use `std::vector` to store data in your structs or
classes, `nsimd::pack` should only be used in computation kernels and nowhere
else especially not in structs or classes.
## Why is the speed-up of my code not as expected?
There are several reasons which can reduce the speed-up:
- Have you enabled compiler optimizations? You must enable all compiler
optimizations (like `-O3`).
- Have you compiled in 64 bit mode? There is significant performance increase
on architectures supporting 64 bit binaries.
- Is your code trivially vectorizable? Modern compilers can vectorize trivial
code segments automatically. If you benchmark a trivial scalar code versus a
vectorized code, the compiler may vectorize the scalar code, thereby giving
similar performance to the vectorized version.
- Some architectures do not provide certain functionalities. For example
AVX2 chips do not provide a way to convert long to double. So using
`nsimd::cvt<f64>` will produce an emulation for-loop in the resulting
binary. To know which intrinsics are used by NSIMD you can consult
<wrapped_intrinsics.md>.
## Why did my code segfault or crash?
The most common cause of segfaults in SIMD codes is accessing non-aligned
memory. For best performance, all memory should be aligned. NSIMD includes an
aligned memory allocation function and an aligned memory allocator to help you
with this. Please refer to <tutorials.md> for details on how to
ensure that your memory is correctly aligned.
Another common cause is to read or write data beyond the allocated memory.
Do not forget that loading data into a SIMD vector will result in loading
16 bytes (or 4 floats) from memory. If this read occurs at the last 2 elements
of allocated memory then a segfault will be generated.
## My code compiled for AVX is not twice as fast as for SSE, why?
Not all SSE instructions have an equivalent AVX instruction. As a consequence
NSIMD uses two SSE operations to emulate the equivalent AVX operation. Also,
the cycles required for certain instructions are not equal on both
architectures, for example, `sqrt` on `SSE` requires 13-14 cycles whereas
`sqrt` on `AVX` requires 21-28 cycles. Please refer
[here](https://www.agner.org/optimize/instruction_tables.pdf) for more
information.
Very few integer operations are supported on AVX, AVX2 is required for most
integer operations. If a NSIMD function is called on an integer AVX register,
this register will be split into two SSE registers and the equivalent
instruction called on both registers. In this case, no speed-up will be observed
compared with SSE code. This is true also on POWER 7, where double is not
supported.
## I disassembled my code, and the generated code is less than optimal, why?
- Have you compiled in release mode, with full optimizations options?
- Have you used a 64 bit compiler?
- There are many SIMD related bugs across all compilers, and some compilers
generate less than optimal code in some cases. Is it possible to update your
compiler to a more modern compiler?
- We provide workarounds for several compiler bugs, however, we may have
missed some. You may also have found a bug in `nsimd`. Please report this
through issues on our github with a minimal code example. We respond quickly
to bug reports and do our best to patch them as quickly as possible.
## How can I use a certain intrinsic?
If you require a certain intrinsic, you may search inside of NSIMD for it and
then call the relevant function or look at <wrapped_intrinsics.md>.
In rare cases, the intrinsic may not be included in NSIMD as we map the
intrinsic wherever it makes sense semantically. If a certain intrinsic does not
fit inside of this model, it may be excluded. In this case, you may call it
yourself, however, note this will not be portable.
To use a particular intrinsic say `_mm_avg_epu8`, you can write the following.
```c++
nsimd::pack<u8> a, b, result;
result = nsimd::pack<u8>(_mm_avg_epu8(a.native_register(),
b.native_register()));
```
## How do I convert integers/floats to/from logicals?
Use [`nsimd::to_mask`](api_to-mask.md) and
[`nsimd::to_logical`](api_to-logical.md).
## How about shuffles?
General shuffles are not provided by NSIMD. You can see
[issue 8 on github](https://github.com/agenium-scale/nsimd/issues/8). For now
we provide only some length agnostic shuffles such as zip and unzip, see
[the shuffle API](api.md) at the Shuffle section.
## Are there C++ STL like algorithms?
No. You are welcome to [contribute](contribute.md) to NSIMD and add them as
a NSIMD module. You should use
[expression templates](module_tet1d_overview.md) instead. Strictly conformant
STL algorithms do not provide means to control for example the unroll factor
or the number of threads per block when compiling for GPUs.
## Are there masked operators in NSIMD?
Yes, we provide masked loads and stores, see [the api](api.md) at the
"Loads & stores" section. We also provide the
[`nsimd::mask_for_loop_tail`](api_mask-for-loop-tail.md) which computes the
mask for ending loops. But note that using these is not recommended as on
most architectures there are no such intrinsics. This will result in slow code.
It is recommended to finish loops using a scalar implementation.
## Are there gathers and scatter in NSIMD?
Yes, we provide gathers and scatters, see [the api](api.md) at the
"Loads & stores" section. Note also that most architectures do not provide
such intrinsics and so this could result in slow code.
## Why does not NSIMD recognize the target architecture automatically?
Autodetecting the SIMD extension is compiler/compiler version/cpu/system
dependent which means a lot of code for a (most likely buggy) feature which can
be an inconvenience sometimes. Plus some compilers do not permit this feature.
For example cf.
<https://www.boost.org/doc/libs/1_71_0/doc/html/predef/reference.html> and
<https://msdn.microsoft.com/en-us/library/b0084kay.aspx>. Thus a "manual"
system is always necessary.
## Why some operators have their names ending with an "1"?
This is because of C++ and our will not to use C++-useless-complicated stuff.
Taking the example with `if_else`, suppose that we have called it "if\_else"
without the "1". When working with packs, one wants to be able to use `if_else`
in this manner:
```c++
int main() {
using namespace nsimd;
typedef pack<int> pi;
typedef pack<float> pf;
int n;
int *a, *b; // suppose both points to n ints
float *fa, *fb; // suppose both points to n floats
for (int i = 0; i < n; i += len()) {
packl<int> cond = (loada<pi>(&a[i]) < loada<pi>(&b[i]));
storea(&fb[i], if_else(cond, load<pf>(&fb[i]), set1<pf>(0.0f)));
}
return 0;
}
```
But this causes a compiler error, the overload of `if_else` is ambiguous.
Sure one can use many C++-ish techniques to tackle this problem but we chose
not to as the goal is to make the life of the compiler as easy as possible.
So as we want to favor the C++ advanced API as it is the most human readable,
users of the C and C++ base APIs will have to use `if_else1`.
================================================
FILE: doc/markdown/fp16.md
================================================
# IEEE float16 related functions
NSIMD natively supports IEEE float16's. This means that NSIMD provides types
and functions to deal with them. When the targeted architecture supports them
then NSIMD will use appropriate intrinsics otherwise emulation with float32's
will be used.
- When emulating, as float16's are not natively supported by either C or C++,
emulation is done with float32's.
- Intel architectures do not support IEEE float16 arithmetic, they only
provide, as an extension, support for conversion to/from float32. When
compiling NSIMD for Intel architectures use `-DFP16` to activate the
conversion intrinsics if available on your machine. Note that AVX-512
has those natively.
- Arm architectures can provide native float16 arithmetic. For 32-bits and
64-bits chips (ARMv7 and Aarch64) chips float16 support is optional. When
compiling with `-DFP16`, NSIMD will use float16-related intrinsics. Note
that for SVE chips float16's are mandatory hence NSIMD will use appropriate
intrinsics with or without `-DFP16`.
- CUDA provides support for converting float16's to/from float32's. These
are always used by NSIMD. But it is only since devices of compute
capabilities 5.3 and above that float16's arithmetic is provided. NSIMD will
always use CUDA float16's functions so there is no need to compile with
`-DFP16`.
- ROCm HIP supports float16's except for the first versions. For now NSIMD
assumes that it is always the case and use HIP float16 API. There is no
need for `-DFP16`.
## Float16's related functions and types
NSIMD provides the `f16` type which represents an IEEE float16. Note that
depending on the targeted architecture and the presence of `-DFP16` the float16
type can be a typedef of many different types. Therefore the two following
functions
are provided and can be used to convert a float16 from/to a float32. These
functions preserve NaN's and infinities. When converting from a float32 to
a float16 saturation to infinities is performed when the float32 cannot be
represented as a float16.
| Function signature | Availability |
|---------------------------------------------------|--------------|
| `f16 nsimd_f32_to_f16(f32 a);` | C and C++ |
| `f32 nsimd_f16_to_f32(f16 a);` | C and C++ |
| `f16 nsimd::f32_to_f16(f32 a);` | C++ only |
| `f32 nsimd::f16_to_f32(f16 a);` | C++ only |
For loading/storing float16's NSIMD provides other conversion functions to/from
16-bits unsigned integers. The integers will hold the IEEE binary
representation of the float16's.
| Function signature | Availability |
|---------------------------------------------------|--------------|
| `u16 nsimd_f32_to_u16(f32 a);` | C and C++ |
| `f32 nsimd_u16_to_f32(u16 a);` | C and C++ |
| `u16 nsimd::f32_to_u16(f32 a);` | C++ only |
| `f32 nsimd::u16_to_f32(u16 a);` | C++ only |
The `nsimd_*` functions listed above do not use the same linkage type depending
on the targeted architecture. When compiling for GPUs the corresponding symbols
names are mangled. They use C++ ABI because the float16 type is defined as a
C++ class and not as a C struct. We therefore inherit from the implementation
of CUDA and HIP/ROCm. Linkage types are listed below.
| Function signature | CUDA/ROCm | Other architectures |
|-----------------------------------|-------------|---------------------|
| `f16 nsimd_f32_to_f16(f32 a);` | C++ linkage | C linkage |
| `f32 nsimd_f16_to_f32(f16 a);` | C++ linkage | C linkage |
| `f16 nsimd::f32_to_f16(f32 a);` | C++ linkage | C++ linkage |
| `f32 nsimd::f16_to_f32(f16 a);` | C++ linkage | C++ linkage |
| `u16 nsimd_f32_to_u16(f32 a);` | C++ linkage | C linkage |
| `f32 nsimd_u16_to_f32(u16 a);` | C++ linkage | C linkage |
| `u16 nsimd::f32_to_u16(f32 a);` | C++ linkage | C++ linkage |
| `f32 nsimd::u16_to_f32(u16 a);` | C++ linkage | C++ linkage |
It is possible to know at compile time in which situation we are. The
`NSIMD_C_LINKAGE_FOR_F16` macro if defined means that C linkage is used for
`nsimd_*` functions.
================================================
FILE: doc/markdown/how_tests_are_done.md
================================================
<!--
Copyright (c) 2021 Agenium Scale
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-->
# How tests are done?
First and foremost note that this is a work in progress and that we are doing
our best to have serious testing of the library.
We can also state our conclusion on testing: we are not and never will be
satisfied with our tests, there are not enough of them, we want more.
The current system has on average 15000 tests per SIMD extension. Thanks to
our "Python" approach we can automatically generate tests for all operators
and for all types. This has greatly helped us in finding bugs. But, as you
know, bugs are always there.
## Why write this?
Testing the library has been taken seriously since its very beginning. Tests
have gone through several stages:
- The first one was during the development of the first version of the library.
Tests of operators were done with random numbers as input. Those random
numbers were all powers of 2 to ease the comparisons of basic arithmetic
types. NaNs and infinities were not generated as inputs and operators
behaviors with those inputs were not tested
- For the second stage random numbers generators have been improved to emit
NaNs and infinities. It allowed us to detect many errors in operators,
mostly in math functions like cos, sin, exp... But we also discovered bugs
in hardware when NaNs and infinities are given to intrinsics.
- The third stage, which is the current test system, takes into account the
  experience we gained with the previous two. As we have abandoned the buggy and
  slow implementations of math functions coming from Boost.SIMD and now rely on
  the excellent Sleef (<https://sleef.org/>) we trust that the math functions
  are correctly tested. In more detail we do not generate NaNs and infinities
  anymore because we trust functions coming from Sleef and we do not want
  to write code in our tests to bypass hardware bugs. We only care that our
  wrappers are correct and that `nsimd::add` correctly calls add; the fact that
  the add does not work correctly is then a hardware bug and not the
  problem of the library.
Tests on floating point numbers are done using ULPs. ULP means units in the
last place and is commonly used for the comparison of floating point numbers.
It is in general a bad idea to compare floats with the `==` operator as
it essentially compares bits. Instead we want to check if the results of
two computations are "not too far away from each other". When checking an
operator, let's say, on CPUs and GPUs, we have to take into account that
- the rounding mode may be different and
- the precision of the calculation may be different.
## ULPs
This chapter is dedicated to math proofs concerning ULPs. Indeed people use
this notion but proofs are hard to find. We give our own definition of distance
in ULPs, compare it to the usual one and give pros and cons.
We assume the reader is familiar with basic mathematics.
For this entire chapter fix the following:
- an integer $b > 1$ (will be our radix),
- an integer $p > 1$ (will be the number of digits in the mantissa)
- an integer $M > 1$ (will be the minimum exponent allowed for floating
point numbers)
A floating point number is an element of $\mathbb{R}$ of the form
$m b^e$ with $e \geq -M$ and $m \in \mathbb{Z}$. More precisely we define
the set of floating point numbers $F$ to be the union of the following two
sets:
- $\{ mb^e \text{ with } e > -M \text{ and } b^{p - 1} \leq |m| < b^p \}$
  the *normal* numbers.
- $\{ mb^{-M} \text{ with } m \in \mathbb{Z} \text{ and }
0 < |m| < b^p \}$ the *denormal* or *subnormal* numbers.
The set $F$ can be viewed as a subset of $\mathbb{R}$ with the mapping
$\phi : (m, e) \mapsto mb^e$ and we will make this abuse of
notation in what follows. Usually the sign of the floating point number
is separated from $m$ but we include it "inside" $m$ as it does not change
the proofs below and simplifies the notations.
Let $a_i \in F$ for $i = 1,2$ such that $a_i = m_i b^{e_i}$.
**Proposition:** $\phi$ is injective.
**Proof:** Suppose that $a_1 = a_2$ or $m_1b^{e_1} = m_2b^{e_2}$. If $a_1$
and $a_2$ are subnormal numbers then $e_1 = e_2 = -M$ and $m_1 = m_2$. If
$a_1$ and $a_2$ are normal numbers suppose that $e_2 > e_1$, then
$|\frac{m_2b^{e_2}}{m_1b^{e_1}}| > b^{e_2 + p - 1 - e_1 - p}
= b^{e_2 - e_1 - 1} \geq b^{1 - 1} = 1$ therefore
$m_2b^{e_2} \neq m_1b^{e_1}$ which is absurd hence $e_1 = e_2$ and as a
consequence $m_1 = m_2$.
**Definition:** We define the *distance in ULPs between $a_1$ and $a_2$*
denoted by $U(a_1, a_2)$ to be:
- $|m_1b^{e_1 - e_2} - m_2|$ if $e_1 \geq e_2$,
- $|m_1 - m_2b^{e_2 - e_1}|$ otherwise.
**Example:** Take $a_1 = 123456 \times 10^5$ and $a_2 = 123789 \times 10^5$
Then as the exponents of $a_1$ and $a_2$ are the same we have
$U(123456 \times 10^5, 123789 \times 10^5) = |123789 - 123456| = 333$.
The following proposition justifies the name "units in the last place".
**Proposition:** Let $f = \lfloor \log_b U(a_1, a_2) \rfloor + 1$ and suppose
that $a_1, a_2$ are of same sign and have the same exponents, then either the
first $p - f$ digits of $m_1$ and $m_2$ are identical or their difference is
$\pm 1$.
**Proof:** For $i = 1,2$ there exists $q_i \in \mathbb{Z}$ and
$0 \leq r_i < b^f$ such that $m_i = q_i b^f + r_i$. Then
$|q_1 - q_2| \leq \frac{|m_1 - m_2| + |r_1 - r_2|}{b^f}
< \frac{b^{\log_b(U(a_1, a_2))} + b^f}{b^f} = 2$
So that either $q_1 = q_2$ or $q_1 - q_2 = \pm 1$. It is interesting to know
what are the cases when $q_1 - q_2 = \pm 1$. Suppose that $0 \leq m_1 < m_2$
and that $q_1 = q_2 + 1$ then $m_1 = q_1 b^f + r_1 \geq q_2 b^f + b^f >
q_2 b^f + r_2 = m_2$ which contradicts the hypothesis hence $q_1 \leq q_2$.
Finally $r_1 + U(a_1, a_2) = r_1 + (m_2 - m_1) = q_2 b^f + r_2 - q_1 b^f
= r_2 + b^f$ so that:
- $r_1 + U(a_1, a_2) \geq b^f$ and
- $r_1 = r_2 + (b^f - U(a_1, a_2)) > r_2$
  since $U(a_1, a_2) < b^f$.
**Example:** Taking back $a_1 = 123456 \times 10^5$ and
$a_2 = 123789 \times 10^5$. As $q_1 = q_2$ we have the first 3 digits of $a_1$
and $a_2$ that are identical and they differ by their last
$\lfloor \log_{10} U(a_1, a_2) \rfloor + 1
= \lfloor \log_{10}(333) \rfloor + 1 = 3$ digits.
**Example:** Now take $a_1 = 899900 \times 10^5$ and
$a_2 = 900100 \times 10^5$. We have $f = 3$ but $q_2 = q_1 + 1$ and
$r_2 = 900 > 100 = r_1$ and $r_2 + U(a_1, a_2) = 1100 \geq 1000 = 10^3$.
The propositions above show that our definition of the ULP distance is well
chosen as we have the following results:
- (second proposition) it measures the number of different digits at the end
of the mantissa.
- (first proposition) if we write the numbers differently but still in base $b$
we only change the number of different digits in the last places by some
zeros. The latter number being the exponent of $b$ that represents the
difference in scaling of both representations of floating point numbers.
We show now how to compute it using the IEEE 754 floating point numbers
representation. A floating point number $(m, e) \in F$ is stored in memory
(and registers) as the integer $\pm ((e + M)b^p + |m|)$.
**Proposition:** If $e_2 \geq e_1 + 2$ then $U(a_1, a_2) \geq b^p$.
**Proof:** We have $U(a_1, a_2) = |m_2 b^{e_2 - e_1} - m_1|
\geq ||m_2| b^{e_2 - e_1} - |m_1||$. But $m_2$ is a normal number otherwise we
would have $e_2 = -M = e_1$ so that $|m_2| \geq b^{p - 1}$ and we have
$|m_2| b^{e_2 - e_1} \geq b^{p - 1 + e_2 - e_1} \geq b^{p + 1} > |m_1|$,
therefore $||m_2| b^{e_2 - e_1} - |m_1|| \geq |m_2|b^2 - |m_1|
> b^{p - 1 + 2} - b^p = b^p$.
The proposition above basically states that if two floating point numbers
are two orders of magnitude away then they have no digits in common, and
that there are good chances that comparing them is not interesting at all.
The usual definition of the distance in ULPs is roughly given as the number
of floating point numbers between the two considered floating point numbers.
More precisely we will denote it by $V$ and it is defined as follows:
- $V(a_1, a_2) = |(e_1 + M)b^p + |m_1| - (e_2 + M)b^p - |m_2||$ if $a_1$ and
$a_2$ have the same signs
- $V(a_1, a_2) = (e_1 + M)b^p + |m_1| + (e_2 + M)b^p + |m_2|$ otherwise.
**Proposition:** If $e_1 = e_2$ and $a_1$, $a_2$ have the same sign then
$U(a_1, a_2) = V(a_1, a_2)$.
**Proof:** We have $V(a_1, a_2) = |(e_1 + M)b^p + m_1 - (e_2 + M)b^p - m_2|$,
but as $e_1 = e_2$, we end up with $V(a_1, a_2) = |m_1 - m_2| = U(a_1, a_2)$.
**Proposition:** $V(a_1, a_2) = 1$ is equivalent to $U(a_1, a_2) = 1$.
**Proof:** The proposition is true if $e_1 = e_2$. Suppose that $e_2 > e_1$.
Note that $a_2$ is a normal number so that $m_2 \geq b^{p - 1}$.
We first suppose that $V(a_1, a_2) = 1$. Then by the definition of $V$, $a_1$
and $a_2$ have same sign otherwise $V(a_1, a_2) \geq 2$ and we suppose that
$a_i \geq 0$. Moreover we have $e_2 = e_1 + 1$ otherwise we would have that
$a_1 = m_1b^{e_1} < m_1b^{e_1 + 1} < m_2b^{e_1 + 2} \leq a_2$. Now we have
$(b^p - 1)b^{e_1} < b^{p - 1}b^{e_1 + 1}$ and let
$(b^p - 1)b^{e_1} \leq mb^e \leq b^{p - 1}b^{e_1 + 1}$.
First note that if $a = mb^e$ is a normal number then $m \geq b^{p - 1}$ and if
$a$ is a subnormal number then $e = -M$ in which case we also have $e_1 = -M$
and $m \geq b^p - 1 \geq b^{p - 1}$. In any case $m \geq b^{p - 1}$.
We have $(b^p - 1)/m b^{e_1} < b^e < b^{p - 1}/m b^{e_1 + 1}$. But
$1 \leq (b^p - 1) / m$ and $b^{p - 1} / m \leq 1$ so that
$b^{e_1} \leq b^e \leq b^{e_1 + 1}$ and $e = e_1$ or $e = e_1 + 1$. In the
first case $(b^p - 1)b^{e_1} \leq mb^{e_1}$ so that $b^p - 1 \leq m$ but
$m < b^p$ and $m = b^p - 1$. In the second case
$mb^{e_1 + 1} \leq b^{p - 1}b^{e_1 + 1}$ so that $m \leq b^{p - 1}$ but
$b^{p - 1} \leq m$ and $m = b^{p - 1}$. We have proven that two consecutive
elements of $F$ with $e_2 = e_1 + 1$ are necessarily of the form
$a_1 = (b^p - 1)b^{e_1}$ and $a_2 = b^{p - 1}b^{e_1 + 1}$. Now we can compute
$U(a_1, a_2) = |bb^{p - 1} - (b^p - 1)| = 1$.
Conversely, suppose that $U(a_1, a_2) = 1$, then
$|b^{e_2 - e_1}m_2 - m_1| = 1$. Suppose that $b^{e_2 - e_1}m_2 - m_1 = -1$,
then $-1 \geq bb^{p - 1} - b^p = 0$ which is absurd. We then have
$b^{e_2 - e_1}m_2 - m_1 = 1$. Suppose that $e_2 \geq e_1 + 2$ then we would
have that $b^{e_2 - e_1}m_2 - m_1 \geq b^2b^{p - 1} - b^p \geq b^p$ which is
absurd so that $e_2 = e_1 + 1$ and $bm_2 - m_1 = 1$. Suppose that
$m_2 \geq b^{p - 1} + 1$ then $bm_2 - m_1 \geq b^p + b - (b^p - 1) \geq 2$
which is absurd so that $m_2 = b^{p - 1}$ and as a consequence $m_1 = b^p - 1$.
If $a_1, a_2 < 0$, then $V(a_1, a_2) = 1$ is equivalent by definition to
$V(-a_1, -a_2) = 1$ which is equivalent to $U(-a_1, -a_2) = 1$ which is
by definition equivalent to $U(a_1, a_2) = 1$.
**Proposition:** Suppose that $e_1 \leq e_2 \leq e_1 + 1$ then
$V \leq U \leq bV$.
**Proof:** The proposition is true if $e_1 = e_2$. Suppose now that
$e_2 = e_1 + 1$. Then we have
$b^p + m_2 - m_1 \geq b^p + b^{p - 1} - b^p \geq 0$
so that $V(a_1, a_2) = b^p + m_2 - m_1 = b^p + m_2(1 - b) + bm_2 - m_1$. But
$b^p + m_2(1 - b) \leq b^p + b^p(1 - b) \leq 0$ and
$bm_2 - m_1 \geq bb^{p - 1} - b^p = 0$ so that $V(a_1, a_2) \leq bm_2 - m_1
= U(a_1, a_2)$. On the other hand we have $bm_2 - m_1
\leq b(b^p + m_2 - m_1 + m_1 - m_1/b - b^p)$ but
$m_1 - m_1/b - b^p \leq b^p - b^{p - 1}/b - b^p \leq 0$ so that
$U(a_1, a_2) \leq b(b^p + m_2 - m_1) = bV(a_1, a_2)$.
**Remark:** The previous propositions show that the difference between $V$
and $U$ is only visible when the arguments have different exponents and
are non consecutive. Our version of the distance in ULPs puts more weight
when crossing powers of $b$. Also if $e_2 \geq e_1 + 2$ then we have seen that
$a_1$ and $a_2$ have nothing in common which is indicated by the fact that
$U, V \geq b^p$.
**Definition:** We now define the relative distance $D(a_1, a_2)$ between
$a_1$ and $a_2$ to be $|a_1 - a_2| / \min(|a_1|, |a_2|)$.
**Proposition:** As $U$ is defined in a "mathematical" way compared to $V$ then
the relation between $U$ and $D$ is straightforward and we have
$D(a_1, a_2) = U(a_1, a_2) / |m_1|$. Moreover we have
$b^{-q}U \leq D \leq b^{1 - q}U$ where $q$ is the greatest integer such that
$b^{q - 1} \leq |m_1| < b^q$. In particular if $a_1$ is a normal number then
$p = q$.
**Proof:** Suppose that $|a_1| < |a_2|$, then we have three cases:
- If $a_2$ is denormal, then so is $a_1$ and $e_1 = -M = e_2$.
- If $a_2$ is normal, then:
+ If $a_1$ is denormal then $e_1 < e_2$.
+ If $a_1$ and $a_2$ are normal numbers then $|m_1/m_2| b^{e_1 - e_2} < 1$
but $|m_1/m_2| \geq b^{p - 1} / b^p = b^{-1}$ and we have
$b^{e_1 - e_2 - 1} < 1$ so that $e_1 < e_2 + 1$ or $e_1 \leq e_2$.
In any case we have $e_1 \leq e_2$, as a consequence we have
$D(a_1, a_2) = |m_1b^{e_1} - m_2b^{e_2}| / \min(|m_1|b^{e_1}, |m_2|b^{e_2})
= |m_1 - m_2b^{e_2 - e_1}| / \min(|m_1|, |m_2|b^{e_2 - e_1})$. Therefore
$D(a_1, a_2) = U(a_1, a_2) / \min(|m_1|, |m_2|b^{e_2 - e_1})$. Now if
$e_1 = e_2$ then $\min(|m_1|, |m_2|) = |m_1|$ but if $e_2 > e_1$ then $a_2$ is
a normal number and $|m_1| < b^p = b \times b^{p - 1} \leq b^{e_2 - e_1} |m_2|$
and again $\min(|m_1|, |m_2|b^{e_2 - e_1}) = |m_1|$.
Applying $b^{q - 1} \leq |m_1| < b^q$ we get that
$b^{-q}U \leq D \leq b^{1 - q}U$. If moreover $a_1$ is a normal number then
by definition $p = q$.
**Remark:** Using the inequality of the previous proposition and taking the
base-$b$ logarithm we get $-q + \log U \leq \log D \leq 1 - q + \log U$ and
then $-q + \lfloor \log U \rfloor \leq \lfloor \log D \rfloor
\leq 1 - q + \lfloor \log U \rfloor$ hence two possibilities:
- $-q + \lfloor \log U \rfloor = \lfloor \log D \rfloor$ in which case
$\lfloor \log U \rfloor + (-\lfloor \log D \rfloor) = q$.
- $1 - q + \lfloor \log U \rfloor = \lfloor \log D \rfloor$ in which case
$1 + \lfloor \log U \rfloor + (-\lfloor \log D \rfloor) = q$.
According to an above proposition we know that $f = 1 + \lfloor \log U \rfloor$
can be interpreted as the number of different digits in the last places of the
mantissa. Write $\mathcal{D} = - \lfloor \log D \rfloor$ then
$q \leq f + \mathcal{D} \leq q + 1$. The latter inequality shows that
$\mathcal{D}$ can be interpreted as the number of digits which are the same in
the mantissa near the "first" place. Note that for denormal numbers the "first"
places are near the bit of most significance. We can conclude this remark with
the interpretation that two floating point numbers have at least
$\mathcal{D} - 1$ digits in common in the first place of the mantissa and $f$
digits which are different in the last place of the mantissa.
**Algorithm:** We give below the C code for $U$ with a caveat. As seen in a
previous proposition when $e_2 \geq e_1 + 2$ the arguments have no digit in
common and can be considered too far away in which case we return `INT_MAX` (or
`LONG_MAX`). As a side effect is that the code will be free of multiprecision
integers (which would be necessary as soon as $|e_2 - e_1| \geq 12$) hence
lesser dependencies, readability, maintainability and performances.
When $|e_2 - e_1| \leq 1$ we use the formula of the definition.
```c
/* We suppose that floats are IEEE754 and not NaN nor infinity */
/* typedef is required for the snippet to be valid C (the original tag-only
   struct declaration compiled as C++ only) */
typedef struct {
  int mantissa;
  int exponent;
} fl_t;

/* Decompose the bit pattern of a_ into a (mantissa, exponent) pair.
   For normal numbers the implicit leading bit is made explicit so that
   2^23 <= |mantissa| < 2^24; for denormal numbers the raw mantissa field
   is kept and the exponent stays at the minimum (-127). The sign is
   carried by the mantissa. */
fl_t decompose(float a_) {
  fl_t ret;
  unsigned int a;
  memcpy(&a, &a_, sizeof(float)); /* avoid aliasing */
  ret.exponent = (int)((a >> 23) & 0xff) - 127;
  if (ret.exponent == -127) {
    /* denormal number */
    ret.mantissa = (int)(a & 0x007fffff);
  } else {
    ret.mantissa = (int)((1 << 23) | (a & 0x007fffff));
  }
  if (a >> 31) {
    ret.mantissa = -ret.mantissa;
  }
  return ret;
}

/* Distance in ULPs (the quantity U defined above). When the exponents
   differ by 2 or more the inputs have no digit in common (see the first
   proposition), so we return INT_MAX instead of resorting to
   multiprecision integers. */
int distance_ulps(float a_, float b_) {
  fl_t a, b;
  a = decompose(a_);
  b = decompose(b_);
  if (a.exponent - b.exponent < -1 || a.exponent - b.exponent > 1) {
    return INT_MAX;
  }
  int d;
  if (a.exponent == b.exponent) {
    d = a.mantissa - b.mantissa; /* fixed: was `a.mantissa = b.mantissa' */
  } else if (a.exponent > b.exponent) {
    d = 2 * a.mantissa - b.mantissa;
  } else {
    d = 2 * b.mantissa - a.mantissa;
  }
  return d > 0 ? d : -d;
}
```
The algorithm for computing $\mathcal{D} - 1$ follows:
```c
/* Computes the quantity D - 1 of the text: a lower bound on the number of
   digits the two inputs have in common near the "first" place of the
   mantissa. Relies on decompose() and distance_ulps() defined above. */
int d(float a_, float b_) {
  float absa = fabsf(a_);
  float absb = fabsf(b_);
  /* ensure that |a_| <= |b_| */
  if (absb < absa) {
    float tmp = absa;
    absa = absb;
    absb = tmp;
  }
  fl_t a = decompose(absa);
  /* q = floor(log2 |m_1|) where m_1 is the mantissa of the smaller input */
  int q;
  for (q = 0; q <= 23 && (2 << q) <= a.mantissa; q++);
  int ulps = distance_ulps(a_, b_);
  /* lu = floor(log2 U); fixed: the loop previously tested `a.mantissa'
     instead of `ulps', leaving `ulps' unused and the result wrong.
     The `lu < 30' cap also prevents evaluating `1 << 31' (undefined
     behavior) when ulps is INT_MAX. */
  int lu;
  for (lu = 0; lu < 30 && (1 << (lu + 1)) <= ulps; lu++);
  return q - (lu + 1) - 1;
}
```
## What we really do in the tests
As said above buggy intrinsics can be easily found. But the bugs appear for
corner cases typically involving NaNs and/or infinities. But according to the
philosophy of NSIMD, it is not the job of its standard operators to propose a
non buggy alternative to a buggy intrinsics. But we still have the problem of
testing. A consequence of the philosophy of NSIMD is that we only have to test
that intrinsics are correctly wrapped. We can reasonably assume that testing
for floating point numbers on only normal numbers is more than sufficient.
Moreover, an implementation (buggy or not), may have different parameters set
that control how floating point arithmetic is done on various components of
the chip. A non-exhaustive list includes:
- Rounding modes (which is not controlled by NSIMD as it is a library)
- FTZ/DAZ (flush to zero) denormal values never appear.
- FTZ/DAZ on some components (SIMD parts) and not others (scalar parts)
- Non IEEE behavior (eg. some NVIDIA GPU and ARMv7 chips)
- A mix of the above
- A buggy mix of the above
As a consequence we do not compare floats using the operator `==` nor do we
use a weird-buggy formula involving the machine epsilon. Instead we use
the algorithm above to make sure that the first bits are correct. More
precisely we use the following algorithm and its variants for float16 and
doubles where `ufp` stands for `units in the first place`.
```c
/* a_ and b_ must be IEEE754 and normal numbers.
   Returns the number of "units in the first place": how many leading
   mantissa bits the two inputs agree on (23 means full agreement, 0 or
   less means none).
   NOTE(review): the sign bit is ignored, so ufps(x, -x) reports full
   agreement — callers presumably only compare values of the same sign. */
int ufps(float a_, float b_) {
  unsigned int a, b;
  memcpy(&a, &a_, 4);
  memcpy(&b, &b_, 4);
  int ea = (int)((a >> 23) & 0xff);
  int eb = (int)((b >> 23) & 0xff);
  /* exponents differing by 2 or more: no bit in common */
  if (ea - eb > 1 || ea - eb < -1) {
    return 0;
  }
  /* the inputs are normal numbers, so make the implicit leading mantissa
     bit explicit; fixed: it was previously dropped, which made the
     cross-exponent branches below wrong (e.g. ufps(2.0f, 1.0f) reported
     23 common bits) */
  int ma = (int)((1 << 23) | (a & 0x007fffff));
  int mb = (int)((1 << 23) | (b & 0x007fffff));
  int d = 0;
  if (ea == eb) {
    d = ma - mb;
  } else if (ea > eb) {
    d = 2 * ma - mb;
  } else {
    d = 2 * mb - ma; /* fixed: a stray `)' made this line a syntax error */
  }
  d = (d >= 0 ? d : -d);
  int i = 0;
  /* i ends at 1 + floor(log2 d) for d >= 1, at 0 for d == 0 */
  for (; i < 30 && d >= (1 << i); i++);
  return 23 - i;
}
```
================================================
FILE: doc/markdown/memory.md
================================================
# Memory functions
Although the purpose of NSIMD is not to provide a full memory container
library, it provides some helper functions to facilitate the end-user. The
functions below only deals with CPUs. If your needs concerns GPUs or memory
transfers between CPUs and GPUs see the [memory management
module](module_memory_management_overview.md).
## Memory functions available in C and C++
- `void *nsimd_aligned_alloc(nsimd_nat n);`
Returns a pointer to `n` bytes of aligned memory. It returns NULL if an
error occurs.
- `void nsimd_aligned_free(void *ptr);`
Frees the memory pointed to by `ptr`.
## Memory functions available in C++
- `void *nsimd::aligned_alloc(nsimd_nat n);`
Returns a pointer to `n` bytes of aligned memory. It returns NULL if an
error occurs.
- `void nsimd::aligned_free(void *ptr);`
Frees the memory pointed to by `ptr`.
- `template <typename T> T *nsimd::aligned_alloc_for(nsimd_nat n);`
Returns a pointer to `n` `T`'s of aligned memory. It returns NULL if an
error occurs.
- `template <typename T> void nsimd::aligned_free_for(void *ptr);`
Free memory pointed to by `ptr`.
## C++ allocators for `std::vector`'s
NSIMD provides C++ allocators so that memory used by C++ container such as
`std::vector`'s will be suitably aligned in memory.
- `template <typename T> class nsimd::allocator;`
The class for allocating aligned memory inside C++ containers.
Example:
```c++
#include <nsimd/nsimd.h>
int main() {
int n = // number of float's to allocate
std::vector<float, nsimd::allocator<float> > myvector(size_t(n));
// In what follows ptr is a pointer suitably aligned for the current SIMD
// targeted architecture.
float *ptr;
// C++98
ptr = &myvector[0];
// C++11 and above
ptr = myvector.data();
}
```
As there is no portable way of having aligned scoped memory, one can use the
NSIMD allocators to emulate such memory.
```c++
#include <nsimd/nsimd.h>
template <typename T, int N> void test() {
std::vector<T, nsimd::allocator<T> > mem(size_t(N));
T *ptr;
// C++98
ptr = &mem[0]; // scoped aligned memory
// C++11 and above
ptr = mem.data(); // scoped aligned memory
}
int main() {
test<float, 16>();
test<double, 8>();
}
```
## C++ scoped memory allocation
NSIMD provides a struct helper for the user to allocate a chunk of memory and
don't care about its release. It uses C++ RAII.
```c++
namespace nsimd {
template <typename T> class scoped_aligned_mem_for {
template <typename I> scoped_aligned_mem_for(I n);
// Construct a struct an array of n T's.
T *get();
// Return the pointer to access memory.
};
}
int main() {
// Allocates 1024 floats in memory. It will be freed when the function (or
// the program) terminates.
nsimd::scoped_aligned_mem_for<float> buffer(1024);
return 0;
}
```
================================================
FILE: doc/markdown/modules/.gitignore
================================================
*/api*.md
================================================
FILE: doc/markdown/modules/fixed_point/overview.md
================================================
<!--
Copyright (c) 2019 Agenium Scale
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-->
# NSIMD fixed point module
## Description
This module implements a fixed-point numbers support for the `nsimd` library.
Fixed-point numbers are integer types used to represent decimal numbers. A number `lf`
of bits are used to encode its integer part, and `rt` bits are used to encode its
fractional part.
The fixed_point module uses the templated type `nsimd::fixed_point::fp_t<lf, rt>` to
represent a fixed_point number. All the basic floating-point arithmetic operators have
been defined, therefore fp_t elements can be manipulated as normal numbers.
The fixed_point module will use a `int8_t`, `int16_t`, or `int32_t` integer type for
storage, depending on the value of `lf + 2 * rt`.
All the functions of the module are under the namespace `nsimd::fixed_point`,
and match the same interface than `nsimd`.
The `fp_t` struct type is defined in `fixed.hpp`, and the associated simd `fpsimd_t`
struct type is defined in `simd.hpp`.
The modules redefines the `nsimd` pack type for fixed-point numbers, templated with `lf`
and `rt` :
```C++
namespace nsimd {
namespace fixed_point {
template <uint8_t lf, uint8_t rt>
struct pack;
} // namespace fixed_point
} // namespace nsimd
```
Then, the pack can be manipulated as an `nsimd` pack like other scalar types.
## Compatibility
The fixed point module is a C++ only API, compatible with the C++98 standard.
It has the same compilers and hardware support than the main `nsimd` API
(see the [API index](../../index.md)).
## Example
Here is a minimal example(main.cpp) :
```C++
#include <ctime>
#include <cstdlib>
#include <iostream>
#include <nsimd/modules/fixed_point.hpp>
float rand_float() {
return 4.0f * ((float) rand() / (float) RAND_MAX) - 2.0f;
}
int main() {
// We use fixed point numbers with 8 bits of integer part and 8 bits of
// decimal part. It will use a 32 bits integer for internal storage.
typedef nsimd::fixed_point::fp_t<8, 8> fp_t;
typedef nsimd::fixed_point::pack<fp_t> fp_pack_t;
const size_t v_size = nsimd::fixed_point::len(fp_t());
fp_t *input0 = (fp_t*)malloc(v_size * sizeof(fp_t));
fp_t *input1 = (fp_t *)malloc(v_size * sizeof(fp_t));
fp_t *res = (fp_t *)malloc(v_size * sizeof(fp_t));
// Input and output initializations
for(size_t i = 0; i < nsimd::fixed_point::len(fp_t()); i++) {
input0[i] = fp_t(rand_float());
input1[i] = fp_t(rand_float());
}
fp_pack_t v0 = nsimd::fixed_point::loadu<fp_pack_t>(input0);
fp_pack_t v1 = nsimd::fixed_point::loadu<fp_pack_t>(input1);
fp_pack_t vres = nsimd::fixed_point::add(v0, v1);
nsimd::fixed_point::storeu(res, vres);
for(size_t i = 0; i < nsimd::fixed_point::len(fp_t()); i++) {
std::cout << float(input0[i]) << " | "
<< float(input1[i]) << " | "
<< float(res[i]) << "\n";
}
std::cout << std::endl;
return EXIT_SUCCESS;
}
```
To test with avx2 run :
```bash
export NSIMD_ROOT=<path/to/simd>
g++ -o main -I$NSIMD_ROOT/include -mavx2 -DNSIMD_AVX2 main.cpp
./main
```
The console output will look like this :
```console
$>./main
1.35938 | -0.421875 | 0.9375
1.13281 | 1.19531 | 2.32812
1.64844 | -1.21094 | 0.4375
-0.660156 | 1.07422 | 0.414062
-0.890625 | 0.214844 | -0.675781
-0.0898438 | 0.515625 | 0.425781
-0.539062 | 0.0546875 | -0.484375
1.80859 | 1.66406 | 3.47266
```
================================================
FILE: doc/markdown/pack.md
================================================
# NSIMD pack and related functions
The advanced C++ API provides types that represents SIMD registers. These
types are struct that allows NSIMD to define infix operators. In this page
NSIMD concepts are reported in the documentation but you can think of them
as usual `typename`s.
## The Pack type
```c++
template <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>
struct pack {
// Typedef to retrieve the native SIMD type
typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;
// Typedef to retrieve T
typedef T value_type;
// Typedef to retrieve SimdExt
typedef SimdExt simd_ext;
// Static member to retrive N
static const int unroll = N;
// Ctor that splats `s`, the resulting vector will be [s, s, s, ...]
template <NSIMD_CONCEPT_VALUE_TYPE S> pack(S const &s);
// Ctor that takes a SIMD vector of native type
// ONLY AVAILABLE when N == 1
pack(simd_vector v);
// Retrieve the underlying native SIMD vector
// ONLY AVAILABLE when N == 1
simd_vector native_register() const;
};
```
Example:
```c++
#include <nsimd/nsimd-all.hpp>
#include <iostream>
int main() {
nsimd::pack<float> v(2.0f);
std::cout << v << '\n';
vf32 nv = v.native_register();
nv = nsimd::add(nv, nv, f32());
std::cout << nsimd::pack<f32>(nv) << '\n';
return 0;
}
```
### Infix operators available for packs
- `pack operator+(pack const &, pack const &);`
- `pack operator*(pack const &, pack const &);`
- `pack operator-(pack const &, pack const &);`
- `pack operator/(pack const &, pack const &);`
- `pack operator-(pack const &);`
- `pack operator|(pack const &, pack const &);`
- `pack operator^(pack const &, pack const &);`
- `pack operator&(pack const &, pack const &);`
- `pack operator~(pack const &);`
- `pack operator<<(pack const &, int);` (only available for integers)
- `pack operator>>(pack const &, int);` (only available for integers)
### Assignment operators available for packs
- `pack operator+=(pack const &);`
- `pack operator-=(pack const &);`
- `pack operator*=(pack const &);`
- `pack operator/=(pack const &);`
- `pack &operator|=(pack const &other);`
- `pack &operator&=(pack const &other);`
- `pack &operator^=(pack const &other);`
- `pack &operator<<=(int);`
- `pack &operator>>=(int);`
### Function aliases
The C++ standard provides functions with different names that does exactly
the same thing. This is due to the retro compatibility with C. Take the
`fmin` C function as an example. In C this function give the minimum between
doubles only. The C++ standard provides overloads to this function so that it
can work on floats and long doubles. The aliases provided by NSIMD have the
same purpose but they are not provided as operator on their own because their
real purpose is to write generic code that can work on scalar and SIMD vector
types. As such they are only relevant for the advanced C++ API.
- `pack fmin(pack const &, pack const &);`
- `pack fmax(pack const &, pack const &);`
- `pack fabs(pack const &);`
They are contained in the `nsimd/cxx_adv_api_aliases.hpp` header and not
provided by default to respect the philosophy of NSIMD which is force the
use to think different between SIMD code and scalar code. They are provided
automatically when including `nsimd/nsimd-all.hpp`.
## The Packl type
```c++
template <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>
struct packl {
// Typedef to retrieve the native SIMD type
typedef typename simd_traits<T, SimdExt>::simd_vectorl simd_vectorl;
// Typedef to retrieve T
typedef T value_type;
// Typedef to retrieve SimdExt
typedef SimdExt simd_ext;
// Static member to retrive N
static const int unroll = N;
// Ctor that splats `s`, the resulting vector will be [s, s, s, ...]
template <NSIMD_CONCEPT_VALUE_TYPE S> packl(S const &s);
// Ctor that takes a SIMD vector of native type
// ONLY AVAILABLE when N == 1
packl(simd_vectorl v);
// Retrieve the underlying native SIMD vector
// ONLY AVAILABLE when N == 1
simd_vector native_register() const;
};
```
Example:
```c++
#include <nsimd/nsimd-all.hpp>
#include <iostream>
int main() {
nsimd::pack<float> v(2.0f);
nsimd::packl<float> mask;
mask = nsimd::eq(v, v);
std::cout << v << '\n';
mask = nsimd::neq(v, v);
std::cout << v << '\n';
return 0;
}
```
### Infix operators involving packls
- `packl operator&&(packl const &, packl const &);`
- `packl operator||(packl const &, packl const &);`
- `packl operator!(packl const &);`
- `packl operator==(pack const &, pack const &);`
- `packl operator!=(pack const &, pack const &);`
- `packl operator<(pack const &, pack const &);`
- `packl operator<=(pack const &, pack const &);`
- `packl operator>(pack const &, pack const &);`
- `packl operator>=(pack const &, pack const &);`
## Packs for SoA/AoS
Types containing several SIMD vectors are also provided to help the user
manipulate arrays of structures. When working, let's say, on complex numbers,
loading them from memory with layout `RIRIRIRIRIRI...` can be done with the
`load2*` operators that will returns 2 SIMD vectors `RRRR` and `IIII` where
`R` stands for real part and `I` for imaginary part.
Similarily loading an RGB image from memory stored following the layout
`RGBRGBRGBRGB...` can be done with `load3*` to get 3 SIMD vectors `RRRR`,
`GGGG` and `BBBB`.
### Packx1
```c++
template <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>
NSIMD_STRUCT packx1 {
// Usual typedefs and static members
typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;
typedef T value_type;
typedef SimdExt simd_ext;
static const int unroll = N;
static const int soa_num_packs = 1;
// Member v0 for reading and writing
pack<T, N, SimdExt> v0;
};
```
### Packx2
```c++
template <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>
NSIMD_STRUCT packx2 {
// Usual typedefs and static members
typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;
typedef T value_type;
typedef SimdExt simd_ext;
static const int unroll = N;
static const int soa_num_packs = 2;
// Members for reading and writing
pack<T, N, SimdExt> v0;
pack<T, N, SimdExt> v1;
};
```
### Packx3
```c++
template <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>
NSIMD_STRUCT packx3 {
// Usual typedefs and static members
typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;
typedef T value_type;
typedef SimdExt simd_ext;
static const int unroll = N;
static const int soa_num_packs = 3;
// Members for reading and writing
pack<T, N, SimdExt> v0;
pack<T, N, SimdExt> v1;
pack<T, N, SimdExt> v2;
};
```
### Packx4
```c++
template <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>
NSIMD_STRUCT packx4 {
// Usual typedefs and static members
typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;
typedef T value_type;
typedef SimdExt simd_ext;
static const int unroll = N;
static const int soa_num_packs = 4;
// Members for reading and writing
pack<T, N, SimdExt> v0;
pack<T, N, SimdExt> v1;
pack<T, N, SimdExt> v2;
pack<T, N, SimdExt> v3;
};
```
### Functions involving packx2, packx3 and packx4
The following functions converts packxs into unrolled packs. The difference
between the `to_pack` and `to_pack_interleave` families of functions is in
the way they flatten (or deinterleave) the structure of SIMD vectors.
```c++
template <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>
pack<T, 2 * N, SimdExt> to_pack(const packx2<T, N, SimdExt> &);
template <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>
pack<T, 3 * N, SimdExt> to_pack(const packx3<T, N, SimdExt> &);
template <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>
pack<T, 4 * N, SimdExt> to_pack(const packx4<T, N, SimdExt> &);
template <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>
pack<T, 2 * N, SimdExt> to_pack_interleave(const packx2<T, N, SimdExt> &);
template <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>
pack<T, 3 * N, SimdExt> to_pack_interleave(const packx3<T, N, SimdExt> &);
template <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>
pack<T, 4 * N, SimdExt> to_pack_interleave(const packx4<T, N, SimdExt> &);
```
The `to_pack` family of functions performs the following operations:
```
packx2<T, 3> = | v0 = [u0 u1 u2] | ---> [u0 u1 u2 w0 w1 w2] = pack<T, 6>
| v1 = [w0 w1 w2] |
```
while the `to_pack_interleave` family of functions does the following:
```
packx2<T, 3> = | v0 = [u0 u1 u2] | ---> [u0 w0 u1 w1 u2 w2] = pack<T, 6>
| v1 = [w0 w1 w2] |
```
================================================
FILE: doc/markdown/tutorial.md
================================================
<!--
Copyright (c) 2020 Agenium Scale
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-->
# NSIMD tutorial
In this tutorial we will write and compile a simple SIMD kernel to become
familiar with the basics of NSIMD. We will also see different aspects of SIMD
programming:
- aligned vs. unaligned data access
- basic SIMD arithmetic
- SIMD loops
- SIMD branching
- architecture selection at runtime
## SIMD basics
SIMD programming means using the CPU SIMD registers to performs operations
on several data at once. A SIMD vector should be viewed as a set of bits
which are interpreted by the operators that operate on them. Taking a 128-bits
wide SIMD register, it can be interpreted as:
- 16 signed/unsigned chars
- 8 signed/unsigned shorts
- 4 signed/unsigned ints
- 4 floats
- 2 signed/unsigned longs
- 2 doubles
as shown in the picture below.

## Computation kernel
We will explain the rewriting of the following kernel which uppercases ASCII
letters only.
@[INCLUDE_CODE:L7:L16](../../examples/tutorial.cpp)
Here is the corresponding SIMD version. Explanations to follow.
@[INCLUDE_CODE:L18:L39](../../examples/tutorial.cpp)
## Getting started with NSIMD
All APIs of NSIMD core is available with this include:
@[INCLUDE_CODE:L1:L1](../../examples/tutorial.cpp)
For ease of programming we use the NSIMD namespace inside the
`uppercase_simd` function.
@[INCLUDE_CODE:L20:L20](../../examples/tutorial.cpp)
## SIMD vectors
A `nsimd::pack<T>` can be considered analogous to a SIMD register (on your or
any other machine). Operations performed on packs - from elementary operations
such as addition to complicated functions such as `nsimd::rsqrt11(x)` - will be
performed using SIMD registers and operations if supported by your hardware. As
shown below, data must be manually loaded into and stored from these registers.
Again, for ease of programming we typedef a pack of T's.
@[INCLUDE_CODE:L21:L21](../../examples/tutorial.cpp)
NSIMD provides another type of pack called `nsimd::packl` which handles vectors
of booleans.
@[INCLUDE_CODE:L22:L22](../../examples/tutorial.cpp)
This distinction between pack's and packl's is necessary for two reasons:
- On recent hardware, SIMD vectors of booleans are handled by dedicated
registers.
- Pack and Packl must have different semantics as arithmetic operators on
booleans have no sense as well as logical operators on Pack's.
## Loading data from memory
One way to construct a `nsimd::pack<T>` is to simply declare
(default-construct) it. Such a pack may *not* be zero-initialized and thus may
*contain arbitrary values*.
Another way to construct a `nsimd::pack<T>` is to fill it with a single value.
This so-called splatting constructor takes one scalar value and replicates it
in all elements of the pack.
But most common usage to construct a `nsimd::pack<T>` is by using the copy
constructor from loading functions.
@[INCLUDE_CODE:L27:L27](../../examples/tutorial.cpp)
## Aligned vs. unaligned memory
Alignment of a given pointer `ptr` to some value `A` means that
`ptr % A == 0`. On older hardware loading data from unaligned memory can
result in performance penalty. On recent hardware it is hard to exhibit a
difference. NSIMD provides two versions of "load":
- `loada` for loading data from aligned memory
- `loadu` for loading data from unaligned memory
Note that using `loada` on unaligned pointer may result in segfaults. As
recent hardware have good support for unaligned memory we use `loadu`.
@[INCLUDE_CODE:L27:L27](../../examples/tutorial.cpp)
To ensure that data allocated by `std::vector` is aligned, NSIMD provide
a C++ allocator.
```c++
std::vector<T, nsimd::allocator<T> > data;
```
When loading data from memory you must ensure that there is sufficient data in
the block of memory you load from to fill a `nsimd::pack<T>`. For example, on
an `AVX` capable machine, a SIMD vector of `float` (32 bits) contains 8
elements. Therefore, there must be at least 8 floats in the memory block you
load data from otherwise loading may result in segfaults. More on this below.
## Operations on pack's and packl's
Once initialized, `nsimd::pack<T>` instances can be used to perform arithmetic.
Usual operations are provided by NSIMD such:
- addition
- subtraction
- multiplication
- division
- square root
- bitwise and/or/xor
- ...
@[INCLUDE_CODE:L28:L29](../../examples/tutorial.cpp)
C++ operators are also overloaded for pack's and packl's as well as between
pack's and scalars or packl's and booleans.
## SIMD branching
NSIMD provide the `if_else` operator which fill the output, lane by lane,
according to the lane value of its first argument:
- if it is true, the output lane will be filled with the second argument's lane
- if it is false, the output lane will be filled with the third argument's lane
Therefore the branching:
@[INCLUDE_CODE:L10:L14](../../examples/tutorial.cpp)
will be rewritten as
@[INCLUDE_CODE:L28:L30](../../examples/tutorial.cpp)
or as a one liner
@[INCLUDE_CODE:L36:L36](../../examples/tutorial.cpp)
## SIMD loops
A SIMD loop is similar to its scalar counterpart except that instead of
going through data one element at a time it goes 4 by 4 or 8 by 8 elements
at a time. More precisely SIMD loops generally goes from steps equal to
pack's length. Therefore the scalar loop
@[INCLUDE_CODE:L9:L9](../../examples/tutorial.cpp)
is rewritten as
@[INCLUDE_CODE:L23:L26](../../examples/tutorial.cpp)
Note that going step by step will only cover most of the data except maybe the
tail of data in case that the number of elements is not a multiple of the
Pack's length. Therefore to perform computations on the tail one has to
load data from only `n` elements where `n < len<p_t>()`. One can use
`maskz_loadu` which will load data only on lanes that are marked as true by
another argument to the function.
@[INCLUDE_CODE:L35:L35](../../examples/tutorial.cpp)
The mask can be computed manually but NSIMD provides a function for it.
@[INCLUDE_CODE:L34:L34](../../examples/tutorial.cpp)
Then the computation on the tail is exactly the same as within the loop. Put
together it gives for the tail:
@[INCLUDE_CODE:L34:L37](../../examples/tutorial.cpp)
Then the entire loop reads as follows.
@[INCLUDE_CODE:L25:L37](../../examples/tutorial.cpp)
## Compiling the Code
Here is the complete listing of the code.
@[INCLUDE_CODE](../../examples/tutorial.cpp)
The compilation of a program using `nsimd` is like any other library.
```bash
c++ -O3 -DAVX2 -mavx2 -L/path/to/lib -lnsimd_avx2 -I/path/to/include tutorial.cpp
```
When compiling with NSIMD, you have to decide at compile time the targeted
SIMD extensions, AVX2 in the example above. It is therefore necessary to
give `-mavx2` to the compiler for it to emit AVX2 instructions. To tell NSIMD
that AVX2 has to be used the `-DAVX2` has to be passed to the compiler. For
an exhaustive list of defines controlling compilation see <defines.md>. There
is a .so file for each SIMD extension, it is therefore necessary to link
against the proper .so file.
## Runtime selection of SIMD extensions
It is sometimes necessary to have several versions of a given algorithm for
different SIMD extensions. This is rather easy to do with NSIMD. Basically the
idea is to write the algorithm in a generic manner using pack's as shown above.
It is then sufficient to compile the same source file for different SIMD
extensions and then link the resulting object files altogether. Suppose that
a file named `uppercase.cpp` contains the following code:
@[INCLUDE_CODE:L18:L38](../../examples/tutorial.cpp)
This would give the following in a Makefile.
```makefile
all: uppercase
uppercase_sse2.o: uppercase.cpp
c++ -O3 -DSSE2 -msse2 -c $? -o $@
uppercase_sse42.o: uppercase.cpp
c++ -O3 -DSSE42 -msse4.2 -c $? -o $@
uppercase_avx.o: uppercase.cpp
c++ -O3 -DAVX -mavx -c $? -o $@
uppercase_avx2.o: uppercase.cpp
c++ -O3 -DAVX2 -mavx2 -c $? -o $@
uppercase: uppercase_sse2.o \
uppercase_sse42.o \
uppercase_avx.o \
           uppercase_avx2.o \
           main.cpp
c++ $? -lnsimd_avx2 -o $@
```
Note that `libnsimd_avx2` contains all the functions for SSE 2, SSE 4.2, AVX
and AVX2. This is a consequence of the backward compatibility of Intel SIMD
extensions. The situation is the same on ARM where `libnsimd_sve.so` will
contain functions for AARCH64.
There is a small caveat. The symbol name corresponding to the `uppercase_simd`
function will be the same for all the object files which will result in errors
when linking together all objects. To avoid this situation one can use
function overloading as follows:
```c++
template <typename T>
void uppercase_simd(NSIMD_SIMD, T *dst, const T *src, int n) {
// ...
}
```
The macro `NSIMD_SIMD` will be expanded to a type containing the information on
the SIMD extension currently requested by the user. This technique is called
tag dispatching and does not require *any* modification of the algorithm
inside the function. Finally in `main` one has to do dispatching by using
either `cpuid` or by some other means.
```c++
int main() {
// what follows is pseudo-code
switch(cpuid()) {
case cpuid_sse2:
uppercase(nsimd::sse2, dst, src, n);
break;
case cpuid_sse42:
uppercase(nsimd::sse42, dst, src, n);
break;
case cpuid_avx:
uppercase(nsimd::avx, dst, src, n);
break;
case cpuid_avx2:
uppercase(nsimd::avx2, dst, src, n);
break;
}
return 0;
}
```
================================================
FILE: doc/md2html.cpp
================================================
/*
Copyright (c) 2020 Agenium Scale
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <ns2.hpp>
#include <stdexcept>
#include <utility>
#include <string>
#include <vector>
// ----------------------------------------------------------------------------
// Extract lines form strings like ":L7:L42"
// Returns -1 if fails
// Parse strings of the form ":L<first>:L<last>" (e.g. ":L7:L42") into a
// pair of line numbers. On any parse failure both members are -1.
std::pair<int, int> extract_lines(std::string const &s) {
  std::pair<int, int> result(-1, -1);
  std::vector<std::string> parts = ns2::split(s, ":L");
  if (parts.size() != 3 || parts[0] != "") {
    return result;
  }
  try {
    result.first = std::stoi(parts[1]);
    result.second = std::stoi(parts[2]);
  } catch (std::exception const &) {
    result = std::pair<int, int>(-1, -1);
  }
  return result;
}
// ----------------------------------------------------------------------------
// Path of the markdown file currently being converted; set by main() so
// that @[INCLUDE*] URLs can be resolved relative to its directory.
std::string callback_input_filename = "";

// Macro callback for ns2::compile_markdown. Handles three pseudo-macros:
//   @[INCLUDE_CODE:Lm:Ln](file)  include lines m..n of file as a code block
//   @[INCLUDE_CODE](file)        include the whole file as a code block
//   @[INCLUDE](file)             include file and compile it as markdown
// Returns the compiled HTML of the inclusion, or "" for unknown labels.
// Throws std::runtime_error when an INCLUDE_CODE line range is malformed.
std::string callback_macro(std::string const &label, std::string const &url,
                           ns2::markdown_infos_t const &markdown_infos) {
  std::string filename;
  if (ns2::startswith(label, "INCLUDE")) {
    // Resolve url relative to the directory of the input markdown file.
    filename = ns2::join_path(ns2::dirname(callback_input_filename), url);
  }
  // Pick the syntax-highlighting language from the file extension.
  std::string lang;
  if (ns2::startswith(label, "INCLUDE_CODE")) {
    std::string const ext = ns2::splitext(filename).second;
    if (ext == "sh") {
      lang = "Bash";
    } else if (ext == "c" || ext == "h") {
      lang = "C";
    } else if (ext == "cpp" || ext == "hpp") {
      lang = "C++";
    } else if (ext == "py") {
      lang = "Python";
    }
  }
  if (ns2::startswith(label, "INCLUDE_CODE:")) {
    // Line-range inclusion: label looks like "INCLUDE_CODE:L28:L29", so
    // substr from the first ':' yields ":L28:L29" for extract_lines().
    std::string const lines_str = label.substr(label.find(':'));
    std::pair<int, int> const l_first_last = extract_lines(lines_str);
    if (l_first_last.first == -1) {
      throw std::runtime_error("cannot extract first line number");
    }
    if (l_first_last.second == -1) {
      throw std::runtime_error("cannot extract last line number");
    }
    std::string out;
    std::string lines;
    { // scope so that the included file is closed before compiling markdown
      ns2::ifile_t in(filename);
      int num_line = 1; // line numbers in the label are 1-based
      std::string line;
      while (std::getline(in, line)) {
        if (num_line == l_first_last.second) {
          lines += line; // last requested line: no trailing newline
        } else if (num_line < l_first_last.second) {
          if (num_line >= l_first_last.first) {
            lines += line + "\n";
          }
        } else {
          break; // past the requested range, stop reading
        }
        ++num_line;
      }
    }
    ns2::compile_markdown("```" + lang + "\n" + ns2::deindent(lines) +
                          "\n```\n",
                          &out, markdown_infos);
    return out;
  }
  if (ns2::startswith(label, "INCLUDE_CODE")) {
    // Whole-file inclusion as a fenced code block.
    std::string out;
    ns2::compile_markdown("```" + lang + "\n" + ns2::read_file(filename) +
                          "\n```\n",
                          &out, markdown_infos);
    return out;
  }
  if (ns2::startswith(label, "INCLUDE")) {
    // Plain markdown inclusion: compile the included file in place.
    ns2::ifile_t in(filename);
    std::ostringstream out;
    ns2::compile_markdown(&in, &out, markdown_infos);
    return out.str();
  }
  return ""; // unknown label: no expansion
}
// ----------------------------------------------------------------------------
// Link callback for ns2::compile_markdown: when emitting HTML, rewrite
// links to *.md files as links to the corresponding *.html page.
// Returns (replacement, true) when handled, ("", false) to fall back to
// the default link rendering.
std::pair<std::string, bool>
callback_link(std::string const &label, std::string const &url,
              ns2::markdown_infos_t const &markdown_infos) {
  if (markdown_infos.output_format == ns2::HTML) {
    std::pair<std::string, std::string> parts = ns2::splitext(url);
    if (parts.second == "md") {
      return std::pair<std::string, bool>(
          ns2::html_href(parts.first + ".html", label), true);
    }
  }
  return std::pair<std::string, bool>("", false);
}
// ----------------------------------------------------------------------------
int main(int argc, char **argv) {
if (argc != 3) {
std::cout << "Usage: " << argv[0] << " <input_file> <output_file>"
<< std::endl;
return 1;
}
std::string const input_filename = argv[1];
std::string const output_filename = argv[2];
ns2::ifile_t input_file(input_filename);
ns2::ofile_t output_file(output_filename);
std::cout << "Convert \"" << input_filename << "\" to \"" << output_filename
<< "\"" << std::endl;
callback_input_filename = input_filename;
ns2::markdown_infos_t markdown_infos(ns2::HTML, callback_macro,
callback_link, true);
ns2::compile_markdown(&input_file, &output_file, markdown_infos);
return 0;
}
================================================
FILE: doc/what_is_wrapped.cpp
================================================
/*
Copyright (c) 2021 Agenium Scale
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
This little C++ program reads and parses files from NSIMD wrapping intrinsics
in order to build a markdown page describing in a table which operators are
just intrinsics wrappers and which ones are more complicated. We only need to
parse C code so there is no need for complicated stuff. Moreover what we do is
really simple and a C parser is not needed.
We replace all C delimiters by spaces, then split the resulting string into
words and we get a vector of strings. Then search in it the function that we
want (say nsimd_add_sse2_f32) along with its opening curly and closing
brackets and finally:
- if there is only one token then it must be an intrinsic
- if there is a for then it must use emulation
- if there are several tokens but no for it must be a trick using other
intrinsics
The produced markdown contains:
- E for emulation
- T for trick with other intrinsics
- NOOP for noop
- a link to the Intel/Arm documentation about the intrinsic otherwise
Well all that to say that a few hundred lines of simple C++ code are more than
enough for our needs and we don't need to depend on some C/C++ parser such
as Clang. Note that using a real parser will be counter productive as some
intrinsics are implemented as macros to compiler builtin which then appear
in the AST instead of the documented intrinsics.
This code is completely non-optimized and we don't care because it does not
take time to execute and it is not our purpose to optimize this code.
*/
// ----------------------------------------------------------------------------
#include <ns2.hpp>
#include <utility>
#include <string>
#include <vector>
// ----------------------------------------------------------------------------
#define MAX_LEN (11 * 11)
typedef std::map<std::string, std::string[MAX_LEN]> table_t;
std::string type_names_str("i8,u8,i16,u16,i32,u32,i64,u64,f16,f32,f64");
std::vector<std::string> types_list(ns2::split(type_names_str, ","));
const size_t not_found = ~((size_t)0);
// ----------------------------------------------------------------------------
// Width in bits of an NSIMD type name such as "i8", "u16", "f32", "f64".
// The two-character names are the 8-bit types; otherwise the two digits
// following the type letter encode the width.
int nbits(std::string const &typ) {
  if (typ == "i8" || typ == "u8") {
    return 8;
  }
  return 10 * (typ[1] - '0') + (typ[2] - '0');
}
// ----------------------------------------------------------------------------
// Build the list of type (or "to_from" type-pair) names matching the
// requested output kind: "same" returns the plain type list; "same_size",
// "bigger_size" and "lesser_size" return pairs filtered on the relative
// bit widths of the destination and source types.
std::vector<std::string> get_types_names(std::string const &output) {
  if (output == "same") {
    return types_list;
  }
  std::vector<std::string> names;
  for (size_t i = 0; i < types_list.size(); ++i) {
    int const from_bits = nbits(types_list[i]);
    for (size_t j = 0; j < types_list.size(); ++j) {
      int const to_bits = nbits(types_list[j]);
      bool keep = false;
      if (output == "same_size") {
        keep = (to_bits == from_bits);
      } else if (output == "bigger_size") {
        keep = (to_bits == 2 * from_bits);
      } else if (output == "lesser_size") {
        keep = (2 * to_bits == from_bits);
      }
      if (keep) {
        names.push_back(types_list[j] + "_" + types_list[i]);
      }
    }
  }
  return names;
}
// ----------------------------------------------------------------------------
// Return the index of the first occurrence of needle in haystack at or
// after position i0, or not_found when absent.
size_t find(std::vector<std::string> const &haystack,
            std::string const &needle, size_t i0 = 0) {
  size_t pos = i0;
  while (pos < haystack.size()) {
    if (haystack[pos] == needle) {
      return pos;
    }
    ++pos;
  }
  return not_found;
}
// ----------------------------------------------------------------------------
// Return the index of the first needle that is a prefix of haystack, or
// not_found when no needle matches.
size_t find_by_prefix(std::vector<std::string> const &needles,
                      std::string const &haystack) {
  size_t i = 0;
  for (; i < needles.size(); ++i) {
    if (ns2::startswith(haystack, needles[i])) {
      break;
    }
  }
  return i < needles.size() ? i : not_found;
}
// ----------------------------------------------------------------------------
// Loose check for C integer literal tokens: only digits possibly mixed
// with 'x', 'l', 'L', 'u', 'U' (hex prefix and integer suffixes) are
// accepted.  Note that the empty string is accepted as a number.
int is_number(std::string const &s) {
  for (size_t i = 0; i < s.size(); ++i) {
    char const c = s[i];
    bool const ok = (c >= '0' && c <= '9') || c == 'x' || c == 'l' ||
                    c == 'L' || c == 'u' || c == 'U';
    if (!ok) {
      return false;
    }
  }
  return true;
}
// ----------------------------------------------------------------------------
// Heuristic: a token is considered a macro name when it consists only of
// underscores and uppercase letters (e.g. "NSIMD_AVX2").  Like is_number,
// the empty string is accepted.
int is_macro(std::string const &s) {
  for (size_t i = 0; i < s.size(); i++) {
    // Reject any character that is neither '_' nor an uppercase letter.
    // (The previous condition used '||', which is true for every possible
    // character — no char is both '_' and in 'A'..'Z' — so the function
    // returned false for every non-empty token and no macro was ever
    // filtered out of the token stream.)
    if (s[i] != '_' && !(s[i] >= 'A' && s[i] <= 'Z')) {
      return false;
    }
  }
  return true;
}
// ----------------------------------------------------------------------------
// Parse one generated C source file and classify, for each type in
// types_names, how the operator op_name is implemented on simd_ext.
// Results are stored into (*table_)[op_name][typ]:
//   "NA"    function not found (or malformed)
//   "NOOP"  empty body
//   "E"     emulation (a for loop is present)
//   "T"     trick built from several intrinsics
//   otherwise a markdown link to the single wrapped intrinsic
// input_vars is the comma-separated list of argument names to strip from
// the token stream (e.g. "a0,a1,a2").
void parse_file(std::string const &input_vars, std::string const &simd_ext,
                std::vector<std::string> const &types_names,
                std::string const &op_name, std::string const &filename,
                table_t *table_) {
  table_t &table = *table_;
  std::string content(ns2::read_file(filename));
  // replace all C delimiters by spaces except {}
  for (size_t i = 0; i < content.size(); i++) {
    // note: sizeof(delims) counts the terminating NUL too, which never
    // matches a character of a text file, so the extra iteration is benign
    const char delims[] = "()[];,:+-*/%&|!%\n\t\r";
    for (size_t j = 0; j < sizeof(delims); j++) {
      if (content[i] == delims[j]) {
        content[i] = ' ';
        break;
      }
    }
  }
  // replace '{' by ' { ' and same for '}' in case there are some code
  // just before/after it
  content = ns2::replace(ns2::replace(content, "}", " } "), "{", " { ");
  // now split string on spaces and removes some tokens
  // (bug fix: a comma was missing after "vector", which glued it to the
  // first type name, producing the bogus token "vectori8" and preventing
  // both "vector" and "i8" from ever being removed)
  std::vector<std::string> to_be_removed(
      ns2::split("return,signed,unsigned,char,short,int,long,float,double,"
                 "const,void,__vector,__bool,bool,vector," +
                 type_names_str + "," + input_vars,
                 ','));
  std::vector<std::string> to_be_removed_by_prefix(ns2::split(
      "_mm_cast,_mm256_cast,_mm512_cast,vreinterpret,svreinterpret,svptrue_",
      ','));
  std::vector<std::string> tokens;
  { // to free tokens0 afterwards
    std::vector<std::string> tokens0 = ns2::split(content, ' ');
    for (size_t i = 0; i < tokens0.size(); i++) {
      // We also remove svptrue_* as they are everywhere for SVE and all
      // casts as they incur no opcode and are often used for intrinsics
      // not supporting certain types
      if (tokens0[i].size() == 0 || is_number(tokens0[i]) ||
          is_macro(tokens0[i]) ||
          find_by_prefix(to_be_removed_by_prefix, tokens0[i]) != not_found ||
          find(to_be_removed, tokens0[i]) != not_found) {
        continue;
      }
      tokens.push_back(tokens0[i]);
    }
  }
  // finally search for intrinsics
  for (size_t typ = 0; typ < types_names.size(); typ++) {
    std::string func_name("nsimd_" + op_name + "_" + simd_ext + "_" +
                          types_names[typ]);
    // find func_name
    size_t pos = find(tokens, func_name);
    if (pos == not_found) {
      table[op_name][typ] = "NA";
      continue;
    }
    // find opening {
    size_t i0 = find(tokens, "{", pos);
    if (i0 == not_found) {
      std::cerr << "WARNING: cannot find opening '{' for '" << func_name
                << "' in '" << filename << "'\n";
      table[op_name][typ] = "NA";
      continue;
    }
    // find the matching closing } (nesting-aware scan)
    size_t i1 = i0;
    int nest = 0;
    for (i1 = i0; i1 < tokens.size(); i1++) {
      if (tokens[i1] == "{") {
        nest++;
      } else if (tokens[i1] == "}") {
        nest--;
      }
      if (nest == 0) {
        break;
      }
    }
    // if there is no token inside {} then it must be a noop
    // if there is only one token inside {} then it must be the intrinsic
    // if there is a for loop then it must be emulation
    // if there are several tokens but no for then it must be a trick
    if (i0 + 1 == i1) {
      table[op_name][typ] = "NOOP";
    } else if (i0 + 2 == i1 && !ns2::startswith(tokens[i0 + 1], "nsimd_")) {
      table[op_name][typ] = "[`" + tokens[i0 + 1] + "`]";
      // append a link to the vendor documentation of the intrinsic
      if (simd_ext == "neon128" || simd_ext == "aarch64") {
        table[op_name][typ] +=
            "(https://developer.arm.com/architectures/instruction-sets/"
            "intrinsics/" + tokens[i0 + 1] + ")";
      } else if (ns2::startswith(simd_ext, "sve")) {
        table[op_name][typ] +=
            "(https://developer.arm.com/documentation/100987/0000)";
      } else if (simd_ext == "sse2" || simd_ext == "sse42" ||
                 simd_ext == "avx" || simd_ext == "avx2" ||
                 simd_ext == "avx512_knl" || simd_ext == "avx512_skylake") {
        table[op_name][typ] += "(https://software.intel.com/sites/landingpage/"
                               "IntrinsicsGuide/#text=" +
                               tokens[i0 + 1] + ")";
      } else if (simd_ext == "vsx" || simd_ext == "vmx") {
        table[op_name][typ] +=
            "(https://www.ibm.com/docs/en/xl-c-aix/13.1.3?topic=functions-" +
            ns2::replace(tokens[i0 + 1], "_", "-") + ")";
      }
    } else {
      if (find(std::vector<std::string>(tokens.begin() + i0,
                                        tokens.begin() + (i1 + 1)),
               "for") != not_found) {
        table[op_name][typ] = "E";
      } else {
        table[op_name][typ] = "T";
      }
    }
  }
}
// ----------------------------------------------------------------------------
// Render one markdown table row "|cell|cell|...|" with nb_col identical
// cells containing cell_content.
std::string md_row(int nb_col, std::string const &cell_content) {
  std::string row("|");
  int col = 0;
  while (col < nb_col) {
    row += cell_content;
    row += "|";
    ++col;
  }
  return row;
}
// ----------------------------------------------------------------------------
// Entry point.  Command line:
//   what_is_wrapped a0,a1,a2 simd_ext output_type op1 file1 [op2 file2 ...]
// Parses each (operator, file) pair and prints, as markdown, which
// intrinsic (if any) backs each operator/type combination.
int main(int argc, char **argv) {
  // argc must be even (program name + 3 fixed args + N pairs) and large
  // enough to hold at least one (operator, file) pair.
  if ((argc % 2) != 0 || argc <= 5) {
    std::cout
        << "Usage: " << argv[0]
        << " a0,a1,a2 simd_ext output_type operator1 file1 operator2 file2 "
           "...\n"
        << "where output_type is (same|same_size|bigger_size|lesser_size)"
        << std::endl;
    return 1;
  }
  std::string input_vars(argv[1]);
  std::string simd_ext(argv[2]);
  std::string output_type(argv[3]);
  std::vector<std::string> types_names = get_types_names(output_type);
  table_t table;
  // Fill the table from each (operator, file) pair on the command line.
  for (int i = 4; i < argc; i += 2) {
    parse_file(input_vars, simd_ext, types_names, argv[i], argv[i + 1],
               &table);
  }
  // Emit one markdown section per operator.
  for (table_t::const_iterator it = table.begin(); it != table.end(); it++) {
    std::cout << "## " << it->first << "\n\n";
    if (output_type == "same") {
      // One bullet per type, in types_list order.
      const std::string(&row)[MAX_LEN] = it->second;
      for (size_t i = 0; i < types_list.size(); i++) {
        std::cout << "- " << it->first << " on **" << types_list[i]
                  << "**: " << row[i] << "\n";
      }
      std::cout << "\n\n";
    } else {
      // One bullet per (from, to) type pair; pairs absent from types_names
      // (whose sizes do not match output_type) are skipped.
      const std::string(&row)[MAX_LEN] = it->second;
      for (size_t i = 0; i < types_list.size(); i++) {
        for (size_t j = 0; j < types_list.size(); j++) {
          std::string cell_content;
          std::string typ(types_list[j] + "_" + types_list[i]);
          // Locate the pair in types_names to get its table column.
          for (size_t k = 0; k < types_names.size(); k++) {
            if (typ == types_names[k]) {
              cell_content = row[k];
              break;
            }
          }
          if (cell_content.size() > 0) {
            std::cout << "- " << it->first << " from **" << types_list[i]
                      << "** to **" << types_list[j] << "**: " << cell_content
                      << "\n";
          }
        }
        std::cout << "\n";
      }
      std::cout << "\n";
    }
  }
  return 0;
}
================================================
FILE: egg/__init__.py
================================================
# Copyright (c) 2019 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from . import operators
================================================
FILE: egg/common.py
================================================
# Use utf-8 encoding
# -*- coding: utf-8 -*-
# Copyright (c) 2020 Agenium Scale
#
# permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# -----------------------------------------------------------------------------
# What does this script?
# ----------------------
#
# This is only a python module that holds what is shared by `generate.py`,
# the `platform_*.py` files and all other python code in `egg`. If contains
# the list of supported types, functions, operators, and some useful helper
# functions such as the python equivalent of `mkdir -p`.
# -----------------------------------------------------------------------------
# Import section
import math
import os
import sys
import io
import collections
import platform
import string
import shutil
import math
# -----------------------------------------------------------------------------
# print
def myprint(opts, obj):
    """Print obj prefixed with '-- ', unless file-listing mode is active."""
    # In --list-files mode only filenames must go to stdout, so stay silent.
    if not opts.list_files:
        print('-- {}'.format(obj))
# -----------------------------------------------------------------------------
# check if file exists
def can_create_filename(opts, filename):
    """Decide whether filename should be (re)generated.

    Returns False when only listing files, or when the file already exists
    and opts.force is not set; True otherwise.  In verbose mode a short
    status line is written to stdout.
    """
    if opts.list_files:
        # Listing mode: emit the bare filename and never create anything.
        print(filename)
        return False
    if opts.verbose:
        sys.stdout.write('-- {}: '.format(filename))
    if os.path.isfile(filename) and not opts.force:
        if opts.verbose:
            sys.stdout.write('skipping\n')
        return False
    if opts.verbose:
        sys.stdout.write('creating (forced)\n' if opts.force
                         else 'creating (missing)\n')
    return True
# -----------------------------------------------------------------------------
# open with UTF8 encoding
def open_utf8(opts, filename):
    """Create filename, write the license header and return an append handle.

    The file is truncated and given a comment-wrapped copyright header
    (full MIT text, or a one-liner when opts.simple_license is set) plus an
    "auto-generated" marker, using the comment syntax matching the file
    extension.  Files with an unrecognized extension get no header at all.
    The returned file object is opened in append mode with UTF-8 encoding.
    """
    dummy, ext = os.path.splitext(filename)
    # Pick the comment delimiters from the extension.  Bug fix: '.hpp' was
    # listed twice in the C/C++ extension list; the duplicate is removed.
    if ext.lower() in ['.c', '.h', '.cpp', '.hpp', '.cc', '.cxx', '.hxx']:
        begin_comment = '/*'
        end_comment = '*/'
    elif ext.lower() in ['.md', '.htm', '.html']:
        begin_comment = '<!--'
        end_comment = '-->'
    else:
        # Unknown extension: we do not know how to write a comment.
        begin_comment = None
    with io.open(filename, mode='w', encoding='utf-8') as fout:
        if begin_comment is not None:
            if opts.simple_license:
                fout.write('''{}
Copyright (c) 2021 Agenium Scale
{}
'''.format(begin_comment, end_comment))
            else:
                fout.write('''{}
Copyright (c) 2021 Agenium Scale
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
{}
'''.format(begin_comment, end_comment))
            # NOTE(review): end_comment is only bound in the two comment
            # branches above, so the marker line must stay inside this `if`
            # (otherwise unknown extensions would raise NameError).
            fout.write('{} This file has been auto-generated {}\n\n'.\
                       format(begin_comment, end_comment))
    return io.open(filename, mode='a', encoding='utf-8')
# -----------------------------------------------------------------------------
# clang-format
def clang_format(opts, filename, cuda=False):
with io.open(filename, 'a', encoding='utf-8') as fout:
fout.write('\n')
if not opts.enable_clang_format:
# TODO: not sure if needed to implement a smarter call to clang-format
if cuda:
os.system('clang-format -style="{{ Standard: Cpp11 }}" -i {}'. \
format(filename))
else:
os.system('clang-format -style="{{ Standard: Cpp03 }}" -i {}'. \
format(filename))
if
gitextract_56lzr4bw/
├── .clang-format
├── .gitignore
├── CMakeLists.txt
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── benches/
│ └── benches.hpp
├── build.nsconfig
├── doc/
│ ├── Makefile.nix
│ ├── Makefile.win
│ ├── markdown/
│ │ ├── compilers_and_versions.md
│ │ ├── concepts.md
│ │ ├── defines.md
│ │ ├── faq.md
│ │ ├── fp16.md
│ │ ├── how_tests_are_done.md
│ │ ├── memory.md
│ │ ├── modules/
│ │ │ ├── .gitignore
│ │ │ └── fixed_point/
│ │ │ └── overview.md
│ │ ├── pack.md
│ │ └── tutorial.md
│ ├── md2html.cpp
│ └── what_is_wrapped.cpp
├── egg/
│ ├── __init__.py
│ ├── common.py
│ ├── cuda.py
│ ├── experiments/
│ │ ├── gen_sleef_operators.py
│ │ ├── round-ppc.c
│ │ └── upcvt-sve.c
│ ├── gen_adv_c_api.py
│ ├── gen_adv_cxx_api.py
│ ├── gen_archis.py
│ ├── gen_base_apis.py
│ ├── gen_benches.py
│ ├── gen_doc.py
│ ├── gen_friendly_but_not_optimized.py
│ ├── gen_modules.py
│ ├── gen_scalar_utilities.py
│ ├── gen_src.py
│ ├── gen_tests.py
│ ├── get_sleef_code.py
│ ├── hatch.py
│ ├── modules/
│ │ ├── fixed_point/
│ │ │ ├── gen_doc.py
│ │ │ ├── gen_tests.py
│ │ │ └── hatch.py
│ │ ├── memory_management/
│ │ │ └── hatch.py
│ │ ├── random/
│ │ │ └── hatch.py
│ │ ├── spmd/
│ │ │ └── hatch.py
│ │ └── tet1d/
│ │ └── hatch.py
│ ├── oneapi.py
│ ├── operators.py
│ ├── platform_arm.py
│ ├── platform_cpu.py
│ ├── platform_ppc.py
│ ├── platform_x86.py
│ ├── rocm.py
│ ├── scalar.py
│ └── x86_load_store_deg234.py
├── examples/
│ ├── module_fixed_point.cpp
│ └── tutorial.cpp
├── include/
│ └── nsimd/
│ ├── c_adv_api.h
│ ├── cxx_adv_api.hpp
│ ├── cxx_adv_api_aliases.hpp
│ ├── modules/
│ │ ├── fixed_point.hpp
│ │ ├── memory_management.hpp
│ │ ├── spmd.hpp
│ │ └── tet1d.hpp
│ ├── nsimd-all.h
│ ├── nsimd-all.hpp
│ └── nsimd.h
├── scripts/
│ ├── FindNSIMD.cmake
│ ├── aarch64-linux-gnu-clang++.sh
│ ├── aarch64-linux-gnu-clang.sh
│ ├── build-tests.bat
│ ├── build-tests.sh
│ ├── build.bat
│ ├── build.sh
│ ├── ci-clang.txt
│ ├── ci-scale.txt
│ ├── ci-test.txt
│ ├── ci.sh
│ ├── compile-gmp-mpfr-for-wasm.sh
│ ├── gen_github_doc.sh
│ ├── hipcc.sh
│ ├── init-benches-deps.sh
│ ├── local-ci-rerun.ini
│ ├── local-ci.ini
│ ├── local-ci.sh
│ ├── one-liner.c
│ ├── powerpc64le-linux-gnu-clang++.sh
│ ├── powerpc64le-linux-gnu-clang.sh
│ ├── setup.bat
│ └── setup.sh
├── src/
│ ├── dd.h
│ ├── df.h
│ ├── estrin.h
│ ├── fp16.cpp
│ ├── gpu.cpp
│ ├── helperadvsimd.h
│ ├── helperavx.h
│ ├── helperavx2.h
│ ├── helperavx512f.h
│ ├── helperneon32.h
│ ├── helperpower_128.h
│ ├── helpersse2.h
│ ├── helpersve.h
│ ├── memory.cpp
│ ├── misc.h
│ ├── rempitab.c
│ ├── rename.h
│ ├── renameadvsimd.h
│ ├── renameavx.h
│ ├── renameavx2.h
│ ├── renameavx512f.h
│ ├── renameneon32.h
│ ├── renamesse2.h
│ ├── renamesse4.h
│ ├── renamesve.h
│ ├── renamevsx.h
│ ├── sleefdp.c
│ ├── sleefsimddp.c
│ ├── sleefsimddp_emulation.c
│ ├── sleefsimdsp.c
│ ├── sleefsimdsp_emulation.c
│ ├── sleefsp.c
│ └── ufp.cpp
└── tests/
├── CMakeLists.txt.sh
├── FindNSIMD.cmake.sh
├── allocator.cpp
├── assign_arith.cpp
├── booleans.cpp
├── c11_vec.c
├── cxx_adv_api_aliases.cpp
├── fp16.prec11.c
├── get_pack.cpp
├── memory.cpp
├── memory.prec11.c
├── modules/
│ └── common.hpp
├── nsimd-all.cpp
├── nsimd.cpp
├── nsimd.prec11.c
├── operator_vector_scalar.cpp
├── shifts.cpp
├── templated_loads_stores.cpp
├── tests_helpers.hpp
├── to_pack.cpp
├── to_pack_interleave.cpp
└── ufp.cpp
Showing preview only (331K chars total). Download the full file or copy to clipboard to get everything.
SYMBOL INDEX (3964 symbols across 88 files)
FILE: benches/benches.hpp
type nsimd (line 8) | namespace nsimd {
type benches (line 9) | namespace benches {
function rand_sign (line 12) | double rand_sign() {
function T (line 21) | T rand_bits(T min, T max = std::numeric_limits<T>::max()) {
function T (line 35) | T rand_from(T min, T max = std::numeric_limits<T>::max()) {
function T (line 42) | T rand_fp(T min, T max) {
function T (line 55) | T rand(T min, T max = std::numeric_limits<T>::max()) {
FILE: doc/md2html.cpp
function extract_lines (line 36) | std::pair<int, int> extract_lines(std::string const &s) {
function callback_macro (line 55) | std::string callback_macro(std::string const &label, std::string const &...
function callback_link (line 130) | std::pair<std::string, bool>
function main (line 148) | int main(int argc, char **argv) {
FILE: doc/what_is_wrapped.cpp
function nbits (line 80) | int nbits(std::string const &typ) {
function get_types_names (line 90) | std::vector<std::string> get_types_names(std::string const &output) {
function find (line 110) | size_t find(std::vector<std::string> const &haystack,
function find_by_prefix (line 122) | size_t find_by_prefix(std::vector<std::string> const &needles,
function is_number (line 134) | int is_number(std::string const &s) {
function is_macro (line 146) | int is_macro(std::string const &s) {
function parse_file (line 157) | void parse_file(std::string const &input_vars, std::string const &simd_ext,
function md_row (line 280) | std::string md_row(int nb_col, std::string const &cell_content) {
function main (line 290) | int main(int argc, char **argv) {
FILE: egg/common.py
function myprint (line 50) | def myprint(opts, obj):
function can_create_filename (line 58) | def can_create_filename(opts, filename):
function open_utf8 (line 80) | def open_utf8(opts, filename):
function clang_format (line 136) | def clang_format(opts, filename, cuda=False):
function logical (line 226) | def logical(typ):
function get_arg (line 269) | def get_arg(i):
function get_args (line 274) | def get_args(n):
function get_simds_deps_from_opts (line 280) | def get_simds_deps_from_opts(opts):
function bitsize (line 287) | def bitsize(typ):
function sizeof (line 292) | def sizeof(typ):
function ilog2 (line 295) | def ilog2(x):
function get_output_types (line 309) | def get_output_types(from_typ, output_to):
function mkdir_p (line 340) | def mkdir_p(path):
function enum (line 352) | def enum(l):
function get_one_type_generic (line 377) | def get_one_type_generic(param, typ):
function get_one_type_specific (line 403) | def get_one_type_specific(param, ext, typ):
function get_one_type_pack (line 429) | def get_one_type_pack(param, inout, N):
function get_one_type_generic_adv_cxx (line 458) | def get_one_type_generic_adv_cxx(param, T, N):
function get_one_type_scalar (line 484) | def get_one_type_scalar(param, t):
function get_first_discriminating_type (line 494) | def get_first_discriminating_type(params):
function pprint_lines (line 503) | def pprint_lines(what):
function pprint_commas (line 506) | def pprint_commas(what):
function pprint_includes (line 509) | def pprint_includes(what):
function parse_signature (line 515) | def parse_signature(signature):
function get_platforms (line 528) | def get_platforms(opts):
function get_modules (line 545) | def get_modules(opts):
function ext_from_lang (line 580) | def ext_from_lang(lang):
function nsimd_category (line 583) | def nsimd_category(category):
function to_filename (line 589) | def to_filename(op_name):
function get_markdown_dir (line 596) | def get_markdown_dir(opts):
function get_markdown_api_file (line 599) | def get_markdown_api_file(opts, name, module=''):
function get_markdown_file (line 607) | def get_markdown_file(opts, name, module=''):
FILE: egg/cuda.py
function get_impl_f16 (line 30) | def get_impl_f16(operator, totyp, typ):
function reinterpret (line 76) | def reinterpret(totyp, typ):
function get_impl (line 99) | def get_impl(operator, totyp, typ):
FILE: egg/experiments/round-ppc.c
function pp (line 4) | void pp(const char *prefix, FILE *out, float buf[4]) {
function main (line 13) | int main() {
FILE: egg/experiments/upcvt-sve.c
function len32 (line 8) | int len32() {
function print32 (line 12) | void print32(FILE *out, const char *var, svfloat32_t a) {
function svfloat32_t (line 25) | svfloat32_t iota32(float i0) {
function len64 (line 35) | int len64() {
function print64 (line 39) | void print64(FILE *out, const char *var, svfloat64_t a) {
function main (line 55) | int main() {
FILE: egg/gen_adv_c_api.py
function get_c11_types (line 28) | def get_c11_types(simd_ext):
function get_c11_overloads (line 133) | def get_c11_overloads(op, simd_ext):
function doit (line 284) | def doit(opts):
FILE: egg/gen_adv_cxx_api.py
function get_cxx_advanced_generic (line 30) | def get_cxx_advanced_generic(operator):
function gen_assignment_operators (line 169) | def gen_assignment_operators(op):
function doit (line 176) | def doit(opts):
FILE: egg/gen_archis.py
function get_simd_implementation_src (line 31) | def get_simd_implementation_src(operator, simd_ext, from_typ, fmtspec):
function get_simd_implementation (line 153) | def get_simd_implementation(opts, operator, mod, simd_ext):
function gen_archis_write_put (line 209) | def gen_archis_write_put(opts, platform, simd_ext, simd_dir):
function gen_archis_write_file (line 285) | def gen_archis_write_file(opts, op, platform, simd_ext, simd_dir):
function gen_archis_simd (line 318) | def gen_archis_simd(opts, platform, simd_ext, simd_dir):
function gen_archis_types (line 323) | def gen_archis_types(opts, simd_dir, platform, simd_ext):
function gen_archis_platform (line 405) | def gen_archis_platform(opts, platform):
function doit (line 417) | def doit(opts):
FILE: egg/gen_base_apis.py
function get_c_base_generic (line 30) | def get_c_base_generic(operator):
function get_cxx_base_generic (line 51) | def get_cxx_base_generic(operator):
function get_put_decl (line 69) | def get_put_decl():
function doit (line 92) | def doit(opts):
FILE: egg/gen_benches.py
function sig_replace_name (line 31) | def sig_replace_name(sig, name):
function sig_translate (line 36) | def sig_translate(sig, translates, name=None):
class BenchError (line 52) | class BenchError(RuntimeError):
function asm_marker (line 58) | def asm_marker(simd, bench_name):
class StaticInitMetaClass (line 90) | class StaticInitMetaClass(type):
method __new__ (line 91) | def __new__(cls, name, bases, dct):
class TypeBase (line 104) | class TypeBase(object, metaclass=StaticInitMetaClass):
method __static_init__ (line 107) | def __static_init__(c):
method is_simd (line 113) | def is_simd(self):
method is_volatile (line 116) | def is_volatile(self):
class TypeVectorBase (line 119) | class TypeVectorBase(TypeBase):
method is_simd (line 120) | def is_simd(self):
class TypeVoid (line 125) | class TypeVoid(TypeBase):
method as_type (line 128) | def as_type(self, typ):
class TypeScalar (line 133) | class TypeScalar(TypeBase):
method as_type (line 136) | def as_type(self, typ):
method code_load (line 139) | def code_load(self, simd, typ, ptr):
method code_store (line 142) | def code_store(self, simd, typ, lhs, rhs):
class TypeVolatileScalar (line 147) | class TypeVolatileScalar(TypeScalar):
method is_volatile (line 150) | def is_volatile(self):
class TypeLogicalScalar (line 155) | class TypeLogicalScalar(TypeBase):
method as_type (line 158) | def as_type(self, typ):
method code_load (line 168) | def code_load(self, simd, typ, ptr):
method code_store (line 171) | def code_store(self, simd, typ, lhs, rhs):
class TypeVolatileLogicalScalar (line 176) | class TypeVolatileLogicalScalar(TypeLogicalScalar):
method is_volatile (line 179) | def is_volatile(self):
class TypeInt (line 184) | class TypeInt(TypeScalar):
method as_type (line 187) | def as_type(self, typ):
class TypePtr (line 192) | class TypePtr(TypeBase):
method as_type (line 195) | def as_type(self, typ):
class TypeConstPtr (line 200) | class TypeConstPtr(TypeBase):
method as_type (line 203) | def as_type(self, typ):
class TypeVector (line 208) | class TypeVector(TypeVectorBase):
method as_type (line 211) | def as_type(self, typ):
method code_load (line 214) | def code_load(self, simd, typ, ptr):
method code_store (line 217) | def code_store(self, simd, typ, ptr, expr):
class TypeCPUVector (line 222) | class TypeCPUVector(TypeVector):
method code_load (line 225) | def code_load(self, simd, typ, ptr):
method code_store (line 228) | def code_store(self, simd, typ, ptr, expr):
class TypeUnrolledVectorBase (line 233) | class TypeUnrolledVectorBase(TypeVectorBase):
method as_type (line 234) | def as_type(self, typ):
method code_load (line 237) | def code_load(self, simd, typ, ptr):
method code_store (line 241) | def code_store(self, simd, typ, ptr, expr):
class TypeUnrolledVector1 (line 246) | class TypeUnrolledVector1(TypeUnrolledVectorBase):
class TypeUnrolledVector2 (line 250) | class TypeUnrolledVector2(TypeUnrolledVectorBase):
class TypeUnrolledVector3 (line 254) | class TypeUnrolledVector3(TypeUnrolledVectorBase):
class TypeUnrolledVector4 (line 258) | class TypeUnrolledVector4(TypeUnrolledVectorBase):
class TypeUnrolledVector5 (line 262) | class TypeUnrolledVector5(TypeUnrolledVectorBase):
class TypeUnrolledVector6 (line 266) | class TypeUnrolledVector6(TypeUnrolledVectorBase):
class TypeUnrolledVector7 (line 270) | class TypeUnrolledVector7(TypeUnrolledVectorBase):
class TypeUnrolledVector8 (line 274) | class TypeUnrolledVector8(TypeUnrolledVectorBase):
class TypeUnrolledVector9 (line 278) | class TypeUnrolledVector9(TypeUnrolledVectorBase):
class TypeVectorX2 (line 284) | class TypeVectorX2(TypeVectorBase):
method as_type (line 287) | def as_type(self, typ):
class TypeVectorX3 (line 292) | class TypeVectorX3(TypeVectorBase):
method as_type (line 295) | def as_type(self, typ):
class TypeVectorX4 (line 300) | class TypeVectorX4(TypeVectorBase):
method as_type (line 303) | def as_type(self, typ):
class TypeLogical (line 308) | class TypeLogical(TypeVectorBase):
method as_type (line 311) | def as_type(self, typ):
method code_load (line 314) | def code_load(self, simd, typ, ptr):
method code_store (line 317) | def code_store(self, simd, typ, ptr, expr):
class TypeCPULogical (line 322) | class TypeCPULogical(TypeLogical):
method code_load (line 325) | def code_load(self, simd, typ, ptr):
method code_store (line 328) | def code_store(self, simd, typ, ptr, expr):
class TypeUnrolledLogicalBase (line 333) | class TypeUnrolledLogicalBase(TypeVectorBase):
method as_type (line 334) | def as_type(self, typ):
method code_load (line 337) | def code_load(self, simd, typ, ptr):
method code_store (line 341) | def code_store(self, simd, typ, ptr, expr):
class TypeUnrolledLogical1 (line 346) | class TypeUnrolledLogical1(TypeUnrolledLogicalBase):
class TypeUnrolledLogical2 (line 350) | class TypeUnrolledLogical2(TypeUnrolledLogicalBase):
class TypeUnrolledLogical3 (line 354) | class TypeUnrolledLogical3(TypeUnrolledLogicalBase):
class TypeUnrolledLogical4 (line 358) | class TypeUnrolledLogical4(TypeUnrolledLogicalBase):
class TypeUnrolledLogical5 (line 362) | class TypeUnrolledLogical5(TypeUnrolledLogicalBase):
class TypeUnrolledLogical6 (line 366) | class TypeUnrolledLogical6(TypeUnrolledLogicalBase):
class TypeUnrolledLogical7 (line 370) | class TypeUnrolledLogical7(TypeUnrolledLogicalBase):
class TypeUnrolledLogical8 (line 374) | class TypeUnrolledLogical8(TypeUnrolledLogicalBase):
class TypeUnrolledLogical9 (line 378) | class TypeUnrolledLogical9(TypeUnrolledLogicalBase):
class TypeBoostSimdVector (line 384) | class TypeBoostSimdVector(TypeVectorBase):
method as_type (line 387) | def as_type(self, typ):
method code_load (line 390) | def code_load(self, simd, typ, ptr):
method code_store (line 393) | def code_store(self, simd, typ, ptr, expr):
class TypeBoostSimdLogicalVector (line 398) | class TypeBoostSimdLogicalVector(TypeVectorBase):
method as_type (line 401) | def as_type(self, typ):
method code_load (line 404) | def code_load(self, simd, typ, ptr):
method code_store (line 407) | def code_store(self, simd, typ, ptr, expr):
class TypeMIPPReg (line 412) | class TypeMIPPReg(TypeVectorBase):
method as_type (line 415) | def as_type(self, typ):
method code_load (line 418) | def code_load(self, simd, typ, ptr):
method code_store (line 421) | def code_store(self, simd, typ, ptr, expr):
class TypeMIPPMsk (line 426) | class TypeMIPPMsk(TypeVectorBase):
method as_type (line 429) | def as_type(self, typ):
method code_load (line 432) | def code_load(self, simd, typ, ptr):
method code_store (line 438) | def code_store(self, simd, typ, ptr, expr):
function type_of (line 446) | def type_of(param):
function as_type (line 452) | def as_type(param, typ):
class BenchOperator (line 458) | class BenchOperator(object, metaclass=type):
method __init__ (line 459) | def __init__(self):
method function_name (line 465) | def function_name(self):
method gen_includes (line 469) | def gen_includes(self, lang):
method match_sig (line 481) | def match_sig(self, signature):
method bench_code_before (line 490) | def bench_code_before(self, typ):
method bench_against_init (line 493) | def bench_against_init(self):
method bench_against_cpu (line 501) | def bench_against_cpu(self):
method bench_against_libs (line 513) | def bench_against_libs(self):
method code_call (line 564) | def code_call(self, typ, args):
method code_ptr_step (line 568) | def code_ptr_step(self, typ, simd):
class BenchOperatorWithNoMakers (line 574) | class BenchOperatorWithNoMakers(BenchOperator):
method __init__ (line 579) | def __init__(self):
class dummy (line 589) | class dummy(operators.MAddToOperators):
method __new__ (line 590) | def __new__(cls, name, bases, dct):
function nsimd_unrolled_fun_from_sig (line 603) | def nsimd_unrolled_fun_from_sig(from_sig, unroll):
function fun_from_sig (line 618) | def fun_from_sig(from_sig):
function std_fun_from_sig (line 626) | def std_fun_from_sig(from_sig):
function std_operator_from_sig (line 629) | def std_operator_from_sig(from_sig, op):
function cpu_fun_from_sig (line 644) | def cpu_fun_from_sig(from_sig):
function sanitize_fun_name (line 653) | def sanitize_fun_name(name):
function code_cast (line 659) | def code_cast(typ, expr):
function code_cast_ptr (line 662) | def code_cast_ptr(typ, expr):
function TODO (line 674) | def TODO(f):
function gen_filename (line 678) | def gen_filename(f, simd, typ):
function gen_bench_name (line 685) | def gen_bench_name(category, name, unroll=None):
function gen_bench_from_code (line 691) | def gen_bench_from_code(f, typ, code, bench_with_timestamp):
function gen_bench_info_from (line 795) | def gen_bench_info_from(f, simd, typ):
function gen_bench_asm_function (line 814) | def gen_bench_asm_function(f, simd, typ, category):
function gen_bench_from_basic_fun (line 855) | def gen_bench_from_basic_fun(f, simd, typ, category, unroll=None):
function gen_code (line 939) | def gen_code(f, simd, typ, category):
function gen_bench_unrolls (line 993) | def gen_bench_unrolls(f, simd, typ, category):
function gen_bench_against (line 1002) | def gen_bench_against(f, simd, typ, against):
function gen_bench_with_timestamp (line 1020) | def gen_bench_with_timestamp(f, simd, typ, category, unroll=None):
function gen_bench_unrolls_with_timestamp (line 1090) | def gen_bench_unrolls_with_timestamp(f, simd, typ, category):
function gen_bench_against_with_timestamp (line 1097) | def gen_bench_against_with_timestamp(f, simd, typ, against):
function gen_bench (line 1115) | def gen_bench(f, simd, typ):
function doit (line 1179) | def doit(opts):
FILE: egg/gen_doc.py
function get_command_output (line 41) | def get_command_output(args):
function gen_overview (line 48) | def gen_overview(opts):
function gen_doc (line 290) | def gen_doc(opts):
function gen_modules_md (line 386) | def gen_modules_md(opts):
function build_exe_for_doc (line 416) | def build_exe_for_doc(opts):
function gen_what_is_wrapped (line 432) | def gen_what_is_wrapped(opts):
function get_html_dir (line 521) | def get_html_dir(opts):
function get_html_api_file (line 524) | def get_html_api_file(opts, name, module=''):
function get_html_file (line 533) | def get_html_file(opts, name, module=''):
function get_html_header (line 611) | def get_html_header(opts, title, filename):
function get_html_footer (line 631) | def get_html_footer():
function gen_doc_html (line 636) | def gen_doc_html(opts, title):
function gen_html (line 682) | def gen_html(opts):
function copy_github_file_to_doc (line 688) | def copy_github_file_to_doc(opts, github_filename, doc_filename):
function doit (line 704) | def doit(opts):
FILE: egg/gen_friendly_but_not_optimized.py
function get_impl (line 30) | def get_impl(operator):
function doit (line 69) | def doit(opts):
FILE: egg/gen_modules.py
function doit (line 24) | def doit(opts):
FILE: egg/gen_scalar_utilities.py
function get_gpu_impl (line 31) | def get_gpu_impl(gpu_sig, cuda_impl, rocm_impl, oneapi_sig, oneapi_impl):
function doit (line 73) | def doit(opts):
FILE: egg/gen_src.py
function get_put_impl (line 30) | def get_put_impl(simd_ext):
function write_cpp (line 106) | def write_cpp(opts, simd_ext, emulate_fp16):
function doit (line 119) | def doit(opts):
FILE: egg/gen_tests.py
function should_i_do_the_test (line 32) | def should_i_do_the_test(operator, tt='', t=''):
function cbprng_impl (line 72) | def cbprng_impl(typ, domain_, for_cpu, only_int = False):
function cbprng (line 127) | def cbprng(typ, operator, target, gpu_params = None):
function get_filename (line 221) | def get_filename(opts, op, typ, lang, custom_name=''):
function get_includes (line 240) | def get_includes(lang):
function get_content (line 395) | def get_content(op, typ, lang):
function gen_test (line 596) | def gen_test(opts, op, typ, lang):
function gen_addv (line 651) | def gen_addv(opts, op, typ, lang):
function aligned_alloc_error (line 745) | def aligned_alloc_error():
function equal (line 760) | def equal(typ):
function adds_subs_check_case (line 768) | def adds_subs_check_case():
function random_sign_flip (line 781) | def random_sign_flip():
function zero_out_arrays (line 789) | def zero_out_arrays(typ):
function compute_op_given_language (line 805) | def compute_op_given_language(typ, op, language):
function compare_expected_vs_computed (line 839) | def compare_expected_vs_computed(typ, op, language):
function test_signed_neither_overflow_nor_underflow (line 865) | def test_signed_neither_overflow_nor_underflow(typ, min_, max_, operator,
function test_signed_all_cases (line 897) | def test_signed_all_cases(typ, min_, max_, oper, oper_is_overflow,
function adds_is_overflow (line 932) | def adds_is_overflow(typ, max_):
function adds_signed_is_underflow (line 940) | def adds_signed_is_underflow(typ, min_):
function adds_signed_is_neither_overflow_nor_underflow (line 948) | def adds_signed_is_neither_overflow_nor_underflow(typ):
function test_adds_overflow (line 961) | def test_adds_overflow(typ, max_):
function test_adds_signed_underflow (line 1008) | def test_adds_signed_underflow(typ, min_):
function test_adds_signed_neither_overflow_nor_underflow (line 1052) | def test_adds_signed_neither_overflow_nor_underflow(typ, min_, max_):
function test_adds_signed_all_cases (line 1058) | def test_adds_signed_all_cases(typ, min_, max_):
function tests_adds_signed (line 1063) | def tests_adds_signed():
function test_adds_unsigned_no_overflow (line 1087) | def test_adds_unsigned_no_overflow(typ, max_):
function test_adds_unsigned_all_cases (line 1114) | def test_adds_unsigned_all_cases(typ, max_):
function tests_adds_unsigned (line 1135) | def tests_adds_unsigned():
function get_adds_tests_cases_for_signed_types (line 1153) | def get_adds_tests_cases_for_signed_types(typ, min_, max_):
function get_adds_tests_cases_for_unsigned_types (line 1183) | def get_adds_tests_cases_for_unsigned_types(typ, max_):
function get_adds_tests_cases_given_type (line 1200) | def get_adds_tests_cases_given_type(typ):
function gen_adds (line 1218) | def gen_adds(opts, op, typ, lang):
function subs_signed_is_overflow (line 1309) | def subs_signed_is_overflow(typ, max_):
function subs_signed_is_underflow (line 1317) | def subs_signed_is_underflow(typ, min_):
function subs_signed_is_neither_overflow_nor_underflow (line 1325) | def subs_signed_is_neither_overflow_nor_underflow(typ):
function subs_unsigned_is_underflow (line 1336) | def subs_unsigned_is_underflow(typ):
function test_subs_signed_overflow (line 1348) | def test_subs_signed_overflow(typ, min_, max_):
function test_subs_signed_underflow (line 1402) | def test_subs_signed_underflow(typ, min_, max_):
function test_subs_signed_neither_overflow_nor_underflow (line 1445) | def test_subs_signed_neither_overflow_nor_underflow(typ, min_, max_):
function test_subs_signed_all_cases (line 1451) | def test_subs_signed_all_cases(typ, min_, max_):
function tests_subs_signed (line 1457) | def tests_subs_signed():
function test_subs_unsigned_underflow (line 1480) | def test_subs_unsigned_underflow(typ, min_, max_):
function test_subs_unsigned_no_underflow (line 1515) | def test_subs_unsigned_no_underflow(typ, max_):
function test_subs_unsigned_all_cases (line 1552) | def test_subs_unsigned_all_cases(typ, min_, max_):
function tests_subs_unsigned (line 1574) | def tests_subs_unsigned():
function get_subs_tests_cases_for_signed_types (line 1592) | def get_subs_tests_cases_for_signed_types(typ, min_, max_):
function get_subs_tests_cases_for_unsigned_types (line 1624) | def get_subs_tests_cases_for_unsigned_types(typ, min_, max_):
function get_subs_tests_cases_given_type (line 1643) | def get_subs_tests_cases_given_type(typ):
function gen_subs (line 1663) | def gen_subs(opts, op, typ, lang):
function gen_all_any (line 1746) | def gen_all_any(opts, op, typ, lang):
function gen_load_store (line 1824) | def gen_load_store(opts, op, typ, lang):
function gen_gather_scatter (line 1930) | def gen_gather_scatter(opts, op, typ, lang):
function gen_mask_scatter (line 2057) | def gen_mask_scatter(opts, op, typ, lang):
function gen_maskoz_gather (line 2176) | def gen_maskoz_gather(opts, op, typ, lang):
function gen_mask_load (line 2312) | def gen_mask_load(opts, op, typ, lang):
function gen_mask_store (line 2451) | def gen_mask_store(opts, op, typ, lang):
function gen_load_store_ravel (line 2559) | def gen_load_store_ravel(opts, op, typ, lang):
function gen_iota (line 2640) | def gen_iota(opts, op, typ, lang):
function gen_nbtrue (line 2690) | def gen_nbtrue(opts, op, typ, lang):
function gen_reinterpret_convert (line 2766) | def gen_reinterpret_convert(opts, op, from_typ, to_typ, lang):
function gen_reverse (line 2930) | def gen_reverse(opts, op, typ, lang):
function gen_unpack_half (line 3020) | def gen_unpack_half(opts, op, typ, lang):
function gen_unpack (line 3195) | def gen_unpack(opts, op, typ, lang):
function doit (line 3374) | def doit(opts):
FILE: egg/get_sleef_code.py
function doit (line 29) | def doit(opts):
FILE: egg/hatch.py
function parse_args (line 64) | def parse_args(args):
function main (line 146) | def main():
FILE: egg/modules/fixed_point/gen_doc.py
function gen_overview (line 38) | def gen_overview(opts):
function get_type (line 150) | def get_type(param, return_typ=False):
function gen_decl (line 170) | def gen_decl(op):
function gen_api (line 191) | def gen_api(opts, op_list):
function gen_doc (line 216) | def gen_doc(opts, op_list):
function doit (line 227) | def doit(opts, op_list):
FILE: egg/modules/fixed_point/gen_tests.py
function get_filename (line 27) | def get_filename(opts, op, lf, rt):
function gen_arithmetic_ops_tests (line 164) | def gen_arithmetic_ops_tests(lf, rt, opts):
function gen_minmax_ops_tests (line 227) | def gen_minmax_ops_tests(lf, rt, opts):
function gen_ternary_ops_tests (line 302) | def gen_ternary_ops_tests(lf, rt, opts):
function gen_math_functions_tests (line 368) | def gen_math_functions_tests(lf, rt, opts):
function gen_comparison_tests (line 435) | def gen_comparison_tests(lf, rt, opts):
function gen_bitwise_ops_tests (line 502) | def gen_bitwise_ops_tests(lf, rt, opts):
function gen_unary_ops_tests (line 573) | def gen_unary_ops_tests(lf, rt, opts):
function gen_if_else_tests (line 647) | def gen_if_else_tests(lf, rt, opts):
function doit (line 668) | def doit(opts):
FILE: egg/modules/fixed_point/hatch.py
function name (line 70) | def name():
function desc (line 73) | def desc():
function doc_menu (line 79) | def doc_menu():
function doit (line 85) | def doit(opts):
FILE: egg/modules/memory_management/hatch.py
function name (line 25) | def name():
function desc (line 28) | def desc():
function doc_menu (line 34) | def doc_menu():
function doit (line 39) | def doit(opts):
FILE: egg/modules/random/hatch.py
class MAddToRands (line 31) | class MAddToRands(type):
method __new__ (line 32) | def __new__(cls, name, bases, dct):
class Rand (line 38) | class Rand(object, metaclass=MAddToRands):
method gen_function_name (line 39) | def gen_function_name(self, nwords, word_size, nrounds):
method gen_headers (line 42) | def gen_headers(self, opts):
method gen_tests (line 52) | def gen_tests(self, opts, nrounds, word_size, nwords):
class Philox (line 187) | class Philox(Rand):
method gen_signature (line 297) | def gen_signature(self, nwords, word_size, nrounds):
method get_key_size (line 306) | def get_key_size(self, nwords):
method gen_func (line 309) | def gen_func(self, opts, nrounds, word_size, nwords):
method generate (line 386) | def generate(self, opts):
class ThreeFry (line 398) | class ThreeFry(Rand):
method gen_signature (line 479) | def gen_signature(self, nwords, word_size, nrounds):
method get_key_size (line 487) | def get_key_size(self, nwords):
method gen_body (line 490) | def gen_body(self, opts, nrounds, word_size, nwords):
method generate (line 560) | def generate(self, opts):
function gen_functions (line 574) | def gen_functions(opts):
function gen_tests (line 643) | def gen_tests(opts):
function name (line 663) | def name():
function desc (line 666) | def desc():
function gen_doc (line 672) | def gen_doc(opts):
function doc_menu (line 710) | def doc_menu():
function doit (line 715) | def doit(opts):
FILE: egg/modules/spmd/hatch.py
function append (line 36) | def append(s1, s2):
function get_signature (line 45) | def get_signature(op):
function gen_doc_overview (line 54) | def gen_doc_overview(opts):
function gen_doc_api (line 385) | def gen_doc_api(opts):
function gen_tests_for_shifts (line 436) | def gen_tests_for_shifts(opts, t, operator):
function gen_tests_for_cvt_reinterpret (line 561) | def gen_tests_for_cvt_reinterpret(opts, tt, t, operator):
function gen_tests_for (line 690) | def gen_tests_for(opts, t, operator):
function gen_tests (line 932) | def gen_tests(opts):
function gen_functions (line 953) | def gen_functions(opts):
function name (line 1096) | def name():
function desc (line 1099) | def desc():
function doc_menu (line 1105) | def doc_menu():
function doit (line 1110) | def doit(opts):
FILE: egg/modules/tet1d/hatch.py
function is_not_closed (line 33) | def is_not_closed(operator):
function gen_doc_overview (line 40) | def gen_doc_overview(opts):
function gen_doc_api (line 211) | def gen_doc_api(opts):
function gen_tests_for_shifts (line 284) | def gen_tests_for_shifts(opts, t, operator):
function gen_tests_for (line 397) | def gen_tests_for(opts, tt, t, operator):
function gen_tests (line 649) | def gen_tests(opts):
function gen_functions (line 665) | def gen_functions(opts):
function name (line 873) | def name():
function desc (line 876) | def desc():
function doc_menu (line 883) | def doc_menu():
function doit (line 888) | def doit(opts):
FILE: egg/oneapi.py
function get_impl_f16 (line 42) | def get_impl_f16(operator, totyp, typ):
function reinterpret (line 140) | def reinterpret(totyp, typ):
function get_impl (line 154) | def get_impl(operator, totyp, typ):
FILE: egg/operators.py
class MAddToCategories (line 35) | class MAddToCategories(type):
method __new__ (line 36) | def __new__(cls, name, bases, dct):
class DocCategory (line 48) | class DocCategory(object, metaclass=MAddToCategories):
class DocShuffle (line 54) | class DocShuffle(DocCategory):
class DocTrigo (line 57) | class DocTrigo(DocCategory):
class DocHyper (line 60) | class DocHyper(DocCategory):
class DocExpLog (line 63) | class DocExpLog(DocCategory):
class DocBasicArithmetic (line 66) | class DocBasicArithmetic(DocCategory):
class DocBitsOperators (line 69) | class DocBitsOperators(DocCategory):
class DocLogicalOperators (line 72) | class DocLogicalOperators(DocCategory):
class DocMisc (line 75) | class DocMisc(DocCategory):
class DocLoadStore (line 78) | class DocLoadStore(DocCategory):
class DocComparison (line 81) | class DocComparison(DocCategory):
class DocRounding (line 84) | class DocRounding(DocCategory):
class DocConversion (line 87) | class DocConversion(DocCategory):
class MAddToOperators (line 95) | class MAddToOperators(type):
method __new__ (line 96) | def __new__(cls, name, bases, dct):
class Operator (line 219) | class Operator(object, metaclass=MAddToOperators):
method returns (line 245) | def returns(self):
method args (line 249) | def args(self):
method __init__ (line 252) | def __init__(self):
method get_return (line 256) | def get_return(self):
method tests_mpfr_name (line 259) | def tests_mpfr_name(self):
method bench_mipp_name (line 262) | def bench_mipp_name(self, typ):
method bench_mipp_types (line 265) | def bench_mipp_types(self):
method bench_sleef_name (line 268) | def bench_sleef_name(self, simd, typ):
method bench_sleef_types (line 271) | def bench_sleef_types(self):
method bench_std_name (line 274) | def bench_std_name(self, simd, typ):
method bench_std_types (line 277) | def bench_std_types(self):
method get_header_guard (line 281) | def get_header_guard(self, platform, simd_ext):
method get_fmtspec (line 285) | def get_fmtspec(self, t, tt, simd_ext):
method get_generic_signature (line 312) | def get_generic_signature(self, lang):
method get_signature (line 534) | def get_signature(self, typename, lang, simd_ext):
method get_scalar_signature (line 556) | def get_scalar_signature(self, cpu_gpu, t, tt, lang):
class SrcOperator (line 576) | class SrcOperator(Operator):
class Len (line 583) | class Len(Operator):
class Set1 (line 588) | class Set1(Operator):
class Set1l (line 594) | class Set1l(Operator):
class Loadu (line 601) | class Loadu(Operator):
class MaskoLoadu1 (line 607) | class MaskoLoadu1(Operator):
class MaskzLoadu1 (line 613) | class MaskzLoadu1(Operator):
class Load2u (line 619) | class Load2u(Operator):
class Load3u (line 626) | class Load3u(Operator):
class Load4u (line 633) | class Load4u(Operator):
class Loada (line 640) | class Loada(Operator):
class MaskoLoada (line 646) | class MaskoLoada(Operator):
class MaskzLoada (line 652) | class MaskzLoada(Operator):
class Load2a (line 658) | class Load2a(Operator):
class Load3a (line 665) | class Load3a(Operator):
class Load4a (line 672) | class Load4a(Operator):
class Loadlu (line 679) | class Loadlu(Operator):
class Loadla (line 687) | class Loadla(Operator):
class Storeu (line 695) | class Storeu(Operator):
class MaskStoreu1 (line 701) | class MaskStoreu1(Operator):
class Store2u (line 707) | class Store2u(Operator):
class Store3u (line 714) | class Store3u(Operator):
class Store4u (line 722) | class Store4u(Operator):
class Storea (line 730) | class Storea(Operator):
class MaskStorea1 (line 736) | class MaskStorea1(Operator):
class Store2a (line 742) | class Store2a(Operator):
class Store3a (line 750) | class Store3a(Operator):
class Store4a (line 758) | class Store4a(Operator):
class Gather (line 766) | class Gather(Operator):
class GatherLinear (line 775) | class GatherLinear(Operator):
class Scatter (line 808) | class Scatter(Operator):
class ScatterLinear (line 818) | class ScatterLinear(Operator):
class Storelu (line 838) | class Storelu(Operator):
class Storela (line 846) | class Storela(Operator):
class Orb (line 854) | class Orb(Operator):
class Andb (line 860) | class Andb(Operator):
class Andnotb (line 866) | class Andnotb(Operator):
class Notb (line 873) | class Notb(Operator):
class Xorb (line 879) | class Xorb(Operator):
class Orl (line 885) | class Orl(Operator):
class Andl (line 891) | class Andl(Operator):
class Andnotl (line 897) | class Andnotl(Operator):
class Xorl (line 904) | class Xorl(Operator):
class Notl (line 909) | class Notl(Operator):
class Add (line 916) | class Add(Operator):
class Sub (line 924) | class Sub(Operator):
class Addv (line 932) | class Addv(Operator):
class Mul (line 940) | class Mul(Operator):
class Div (line 946) | class Div(Operator):
class Neg (line 953) | class Neg(Operator):
class Min (line 959) | class Min(Operator):
class Max (line 964) | class Max(Operator):
class Shr (line 969) | class Shr(Operator):
class Shl (line 976) | class Shl(Operator):
class Shra (line 983) | class Shra(Operator):
class Eq (line 990) | class Eq(Operator):
class Ne (line 996) | class Ne(Operator):
class Gt (line 1003) | class Gt(Operator):
class Ge (line 1010) | class Ge(Operator):
class Lt (line 1017) | class Lt(Operator):
class Le (line 1024) | class Le(Operator):
class If_else1 (line 1031) | class If_else1(Operator):
class Abs (line 1040) | class Abs(Operator):
class Fma (line 1045) | class Fma(Operator):
class Fnma (line 1053) | class Fnma(Operator):
class Fms (line 1061) | class Fms(Operator):
class Fnms (line 1069) | class Fnms(Operator):
class Ceil (line 1078) | class Ceil(Operator):
class Floor (line 1083) | class Floor(Operator):
class Trunc (line 1088) | class Trunc(Operator):
class Round_to_even (line 1093) | class Round_to_even(Operator):
class All (line 1098) | class All(Operator):
class Any (line 1104) | class Any(Operator):
class Nbtrue (line 1111) | class Nbtrue(Operator):
class Reinterpret (line 1117) | class Reinterpret(Operator):
class Reinterpretl (line 1125) | class Reinterpretl(Operator):
class Cvt (line 1135) | class Cvt(Operator):
class Upcvt (line 1143) | class Upcvt(Operator):
class Downcvt (line 1152) | class Downcvt(Operator):
class Rec (line 1161) | class Rec(Operator):
class Rec11 (line 1168) | class Rec11(Operator):
class Rec8 (line 1176) | class Rec8(Operator):
class Sqrt (line 1184) | class Sqrt(Operator):
class Rsqrt11 (line 1191) | class Rsqrt11(Operator):
class Rsqrt8 (line 1199) | class Rsqrt8(Operator):
class Ziplo (line 1207) | class Ziplo(Operator):
class Ziphi (line 1216) | class Ziphi(Operator):
class Unziplo (line 1225) | class Unziplo(Operator):
class Unziphi (line 1231) | class Unziphi(Operator):
class Zip (line 1237) | class Zip(Operator):
class Unzip (line 1243) | class Unzip(Operator):
class ToMask (line 1249) | class ToMask(Operator):
class ToLogical (line 1256) | class ToLogical(Operator):
class Iota (line 1264) | class Iota(Operator):
class MaskForLoopTail (line 1271) | class MaskForLoopTail(Operator):
class Adds (line 1280) | class Adds(Operator):
class Subs (line 1286) | class Subs(Operator):
class Sin_u35 (line 1293) | class Sin_u35(SrcOperator):
class Cos_u35 (line 1301) | class Cos_u35(SrcOperator):
class Tan_u35 (line 1310) | class Tan_u35(SrcOperator):
class Asin_u35 (line 1320) | class Asin_u35(SrcOperator):
class Acos_u35 (line 1330) | class Acos_u35(SrcOperator):
class Atan_u35 (line 1340) | class Atan_u35(SrcOperator):
class Atan2_u35 (line 1349) | class Atan2_u35(SrcOperator):
class Log_u35 (line 1359) | class Log_u35(SrcOperator):
class Cbrt_u35 (line 1369) | class Cbrt_u35(SrcOperator):
class Sin_u10 (line 1378) | class Sin_u10(SrcOperator):
class Cos_u10 (line 1386) | class Cos_u10(SrcOperator):
class Tan_u10 (line 1395) | class Tan_u10(SrcOperator):
class Asin_u10 (line 1405) | class Asin_u10(SrcOperator):
class Acos_u10 (line 1415) | class Acos_u10(SrcOperator):
class Atan_u10 (line 1425) | class Atan_u10(SrcOperator):
class Atan2_u10 (line 1434) | class Atan2_u10(SrcOperator):
class Log_u10 (line 1444) | class Log_u10(SrcOperator):
class Cbrt_u10 (line 1454) | class Cbrt_u10(SrcOperator):
class Exp_u10 (line 1463) | class Exp_u10(SrcOperator):
class Pow_u10 (line 1473) | class Pow_u10(SrcOperator):
class Sinh_u10 (line 1482) | class Sinh_u10(SrcOperator):
class Cosh_u10 (line 1491) | class Cosh_u10(SrcOperator):
class Tanh_u10 (line 1500) | class Tanh_u10(SrcOperator):
class Sinh_u35 (line 1509) | class Sinh_u35(SrcOperator):
class Cosh_u35 (line 1518) | class Cosh_u35(SrcOperator):
class Tanh_u35 (line 1527) | class Tanh_u35(SrcOperator):
class Asinh_u10 (line 1536) | class Asinh_u10(SrcOperator):
class Acosh_u10 (line 1545) | class Acosh_u10(SrcOperator):
class Atanh_u10 (line 1555) | class Atanh_u10(SrcOperator):
class Exp2_u10 (line 1565) | class Exp2_u10(SrcOperator):
class Exp2_u35 (line 1575) | class Exp2_u35(SrcOperator):
class Exp10_u10 (line 1585) | class Exp10_u10(SrcOperator):
class Exp10_u35 (line 1595) | class Exp10_u35(SrcOperator):
class Expm1_u10 (line 1605) | class Expm1_u10(SrcOperator):
class Log10_u10 (line 1615) | class Log10_u10(SrcOperator):
class Log2_u10 (line 1625) | class Log2_u10(SrcOperator):
class Log2_u35 (line 1635) | class Log2_u35(SrcOperator):
class Log1p_u10 (line 1645) | class Log1p_u10(SrcOperator):
class Sinpi_u05 (line 1655) | class Sinpi_u05(SrcOperator):
class Cospi_u05 (line 1664) | class Cospi_u05(SrcOperator):
class Hypot_u05 (line 1673) | class Hypot_u05(SrcOperator):
class Hypot_u35 (line 1682) | class Hypot_u35(SrcOperator):
class Remainder (line 1691) | class Remainder(SrcOperator):
class Fmod (line 1700) | class Fmod(SrcOperator):
class Lgamma_u10 (line 1709) | class Lgamma_u10(SrcOperator):
class Tgamma_u10 (line 1719) | class Tgamma_u10(SrcOperator):
class Erf_u10 (line 1729) | class Erf_u10(SrcOperator):
class Erfc_u15 (line 1738) | class Erfc_u15(SrcOperator):
FILE: egg/platform_arm.py
function neon_typ (line 45) | def neon_typ(typ):
function half_neon64_typ (line 49) | def half_neon64_typ(typ):
function sve_typ (line 54) | def sve_typ(typ):
function suf (line 58) | def suf(typ):
function convert_from_predicate (line 69) | def convert_from_predicate(opts, op):
function convert_to_predicate (line 78) | def convert_to_predicate(opts, op):
function get_simd_exts (line 91) | def get_simd_exts():
function get_prev_simd_ext (line 95) | def get_prev_simd_ext(simd_ext):
function emulate_fp16 (line 102) | def emulate_fp16(simd_ext):
function get_type (line 110) | def get_type(opts, simd_ext, typ, nsimd_typ):
function get_logical_type (line 137) | def get_logical_type(opts, simd_ext, typ, nsimd_typ):
function get_nb_registers (line 177) | def get_nb_registers(simd_ext):
function get_native_soa_typ (line 186) | def get_native_soa_typ(simd_ext, typ, deg):
function get_SoA_type (line 195) | def get_SoA_type(simd_ext, typ, deg, nsimd_typ):
function has_compatible_SoA_types (line 203) | def has_compatible_SoA_types(simd_ext):
function get_additional_include (line 210) | def get_additional_include(func, platform, simd_ext):
function emulate_op1 (line 294) | def emulate_op1(op, simd_ext, typ):
function emulate_op2 (line 317) | def emulate_op2(op, simd_ext, typ):
function emulate_lop2_neon (line 339) | def emulate_lop2_neon(opts, op, simd_ext, typ):
function emulate_op3_neon (line 353) | def emulate_op3_neon(op, simd_ext, typ):
function emulate_f64_neon (line 368) | def emulate_f64_neon(simd_ext, op, params):
function f16f64 (line 395) | def f16f64(simd_ext, typ, op, armop, arity, forced_intrinsics = ''):
function max_len (line 425) | def max_len(simd_ext, typ):
function real_len (line 433) | def real_len(simd_ext, typ):
function load1234 (line 442) | def load1234(opts, simd_ext, typ, deg):
function maskoz_load (line 539) | def maskoz_load(oz, simd_ext, typ):
function store1234 (line 607) | def store1234(opts, simd_ext, typ, deg):
function mask_store (line 703) | def mask_store(simd_ext, typ):
function len1 (line 747) | def len1(simd_ext, typ):
function addsub (line 759) | def addsub(op, simd_ext, typ):
function mul2 (line 773) | def mul2(simd_ext, typ):
function div2 (line 789) | def div2(simd_ext, typ):
function binop2 (line 805) | def binop2(op, simd_ext, typ):
function not1 (line 842) | def not1(simd_ext, typ):
function lop2 (line 878) | def lop2(opts, op, simd_ext, typ):
function lnot1 (line 922) | def lnot1(opts, simd_ext, typ):
function sqrt1 (line 956) | def sqrt1(simd_ext, typ):
function shl_shr (line 978) | def shl_shr(op, simd_ext, typ):
function shra (line 1005) | def shra(simd_ext, typ):
function set1 (line 1026) | def set1(simd_ext, typ):
function lset1 (line 1051) | def lset1(simd_ext, typ):
function cmp2 (line 1085) | def cmp2(opts, op, simd_ext, typ):
function neq2 (line 1135) | def neq2(opts, simd_ext, typ):
function if_else3 (line 1149) | def if_else3(opts, simd_ext, typ):
function minmax2 (line 1187) | def minmax2(op, simd_ext, typ):
function abs1 (line 1211) | def abs1(simd_ext, typ):
function round1 (line 1231) | def round1(op, simd_ext, typ):
function fmafnma3 (line 1257) | def fmafnma3(op, simd_ext, typ):
function fmsfnms3 (line 1296) | def fmsfnms3(op, simd_ext, typ):
function neg1 (line 1315) | def neg1(simd_ext, typ):
function recs1 (line 1356) | def recs1(op, simd_ext, typ):
function loadl (line 1402) | def loadl(aligned, simd_ext, typ):
function storel (line 1415) | def storel(aligned, simd_ext, typ):
function allany1 (line 1431) | def allany1(opts, op, simd_ext, typ):
function nbtrue1 (line 1482) | def nbtrue1(opts, simd_ext, typ):
function reinterpretl1 (line 1528) | def reinterpretl1(simd_ext, from_typ, to_typ):
function convert1 (line 1592) | def convert1(simd_ext, from_typ, to_typ):
function reinterpret1 (line 1671) | def reinterpret1(simd_ext, from_typ, to_typ):
function reverse1 (line 1749) | def reverse1(simd_ext, typ):
function addv (line 1776) | def addv(simd_ext, typ):
function upcvt1 (line 1844) | def upcvt1(simd_ext, from_typ, to_typ):
function downcvt1 (line 1948) | def downcvt1(simd_ext, from_typ, to_typ):
function adds (line 2015) | def adds(simd_ext, from_typ):
function subs (line 2027) | def subs(simd_ext, from_typ):
function to_mask1 (line 2039) | def to_mask1(opts, simd_ext, typ):
function iota (line 2078) | def iota(simd_ext, typ):
function mask_for_loop_tail (line 2113) | def mask_for_loop_tail(simd_ext, typ):
function to_logical1 (line 2139) | def to_logical1(opts, simd_ext, typ):
function zip_unzip_half (line 2170) | def zip_unzip_half(func, simd_ext, typ):
function zip_unzip (line 2253) | def zip_unzip(func, simd_ext, typ):
function gather (line 2307) | def gather(simd_ext, typ):
function gather_linear (line 2365) | def gather_linear(simd_ext, typ):
function maskoz_gather (line 2413) | def maskoz_gather(oz, simd_ext, typ):
function scatter (line 2513) | def scatter(simd_ext, typ):
function scatter_linear (line 2562) | def scatter_linear(simd_ext, typ):
function mask_scatter (line 2601) | def mask_scatter(simd_ext, typ):
function get_impl (line 2666) | def get_impl(opts, func, simd_ext, from_typ, to_typ):
FILE: egg/platform_cpu.py
function get_nb_el (line 40) | def get_nb_el(typ):
function get_simd_exts (line 46) | def get_simd_exts():
function get_prev_simd_ext (line 49) | def get_prev_simd_ext(simd_ext):
function get_simd_strings (line 54) | def get_simd_strings(simd_ext):
function emulate_fp16 (line 60) | def emulate_fp16(simd_ext):
function get_type (line 65) | def get_type(opts, simd_ext, typ, nsimd_typ):
function get_logical_type (line 75) | def get_logical_type(opts, simd_ext, typ, nsimd_typ):
function get_nb_registers (line 84) | def get_nb_registers(simd_ext):
function has_compatible_SoA_types (line 89) | def has_compatible_SoA_types(simd_ext):
function get_additional_include (line 94) | def get_additional_include(func, platform, simd_ext):
function repeat_stmt (line 117) | def repeat_stmt(fmt, typ):
function func_body (line 122) | def func_body(fmt, typ2, logical = False):
function op2 (line 130) | def op2(op, typ):
function lop2 (line 137) | def lop2(op, typ):
function landnot2 (line 143) | def landnot2(typ):
function lnot1 (line 149) | def lnot1(typ):
function scalar_impl (line 155) | def scalar_impl(func, typ, arity):
function cmp2 (line 168) | def cmp2(op, typ):
function set1 (line 178) | def set1(typ):
function set1l (line 190) | def set1l(typ):
function load (line 196) | def load(typ):
function maskoz_load (line 210) | def maskoz_load(oz, typ):
function load_deg234 (line 230) | def load_deg234(typ, deg):
function store_deg234 (line 248) | def store_deg234(typ, deg):
function loadl (line 266) | def loadl(typ):
function store (line 283) | def store(typ):
function mask_store (line 294) | def mask_store(typ):
function storel (line 307) | def storel(typ):
function if_else1 (line 323) | def if_else1(typ):
function all_any (line 332) | def all_any(typ, func):
function reinterpret1 (line 347) | def reinterpret1(from_typ, to_typ):
function reinterpretl1 (line 358) | def reinterpretl1(from_typ, to_typ):
function convert1 (line 364) | def convert1(from_typ, to_typ):
function nbtrue1 (line 374) | def nbtrue1(typ):
function reverse1 (line 383) | def reverse1(typ):
function addv1 (line 394) | def addv1(typ):
function upcvt1 (line 404) | def upcvt1(from_typ, to_typ):
function downcvt2 (line 421) | def downcvt2(from_typ, to_typ):
function len1 (line 438) | def len1(typ):
function to_logical1 (line 443) | def to_logical1(typ):
function to_mask1 (line 463) | def to_mask1(typ):
function zip_half (line 486) | def zip_half(func, typ):
function unzip_half (line 503) | def unzip_half(func, typ):
function zip (line 524) | def zip(from_typ):
function unzip (line 530) | def unzip(from_typ):
function mask_for_loop_tail (line 538) | def mask_for_loop_tail(typ):
function iota (line 545) | def iota(typ):
function gather (line 552) | def gather(typ):
function gather_linear (line 562) | def gather_linear(typ):
function maskoz_gather (line 572) | def maskoz_gather(op, typ):
function scatter (line 591) | def scatter(typ):
function scatter_linear (line 601) | def scatter_linear(typ):
function mask_scatter (line 611) | def mask_scatter(typ):
function get_impl (line 623) | def get_impl(opts, func, simd_ext, from_typ, to_typ=''):
FILE: egg/platform_ppc.py
function has_to_be_emulated (line 36) | def has_to_be_emulated(simd_ext, typ):
function native_type (line 44) | def native_type(typ):
function native_typel (line 69) | def native_typel(typ):
function get_len (line 82) | def get_len(typ):
function emulate_64 (line 86) | def emulate_64(op, typ, params):
function emulate_f16 (line 112) | def emulate_f16(op, simd_ext, params):
function emulation_code (line 124) | def emulation_code(op, simd_ext, typ, params):
function emulate_with_scalar (line 133) | def emulate_with_scalar(op, simd_ext, typ, params):
function emulate_fp16 (line 170) | def emulate_fp16(simd_ext):
function get_simd_exts (line 173) | def get_simd_exts():
function get_type (line 176) | def get_type(opts, simd_ext, typ, nsimd_typ):
function get_logical_type (line 189) | def get_logical_type(opts, simd_ext, typ, nsimd_typ):
function get_nb_registers (line 202) | def get_nb_registers(simd_ext):
function has_compatible_SoA_types (line 210) | def has_compatible_SoA_types(simd_ext):
function get_additional_include (line 216) | def get_additional_include(func, platform, simd_ext):
function printf2 (line 316) | def printf2(*args0):
function load1234 (line 385) | def load1234(simd_ext, typ, deg, aligned):
function store1234 (line 624) | def store1234(simd_ext, typ, deg, aligned):
function len1 (line 803) | def len1(simd_ext, typ):
function simple_op2 (line 809) | def simple_op2(op, simd_ext, typ):
function binary_op2 (line 815) | def binary_op2(op, simd_ext, typ):
function logical_op2 (line 823) | def logical_op2(op, simd_ext, typ):
function div2 (line 831) | def div2(simd_ext, typ):
function not1 (line 850) | def not1(simd_ext, typ):
function lnot1 (line 857) | def lnot1(simd_ext, typ):
function sqrt1 (line 864) | def sqrt1(simd_ext, typ):
function shift2 (line 871) | def shift2(op, simd_ext, typ):
function set1 (line 880) | def set1(simd_ext, typ):
function lset1 (line 897) | def lset1(simd_ext, typ):
function cmp2 (line 918) | def cmp2(op, simd_ext, typ):
function if_else3 (line 933) | def if_else3(simd_ext, typ):
function minmax2 (line 945) | def minmax2(op, simd_ext, typ):
function abs1 (line 952) | def abs1(simd_ext, typ):
function round1 (line 961) | def round1(op, simd_ext, typ):
function fma (line 973) | def fma(op, simd_ext, typ):
function neg1 (line 997) | def neg1(simd_ext, typ):
function recs1 (line 1007) | def recs1(op, simd_ext, typ):
function loadl (line 1020) | def loadl(aligned, simd_ext, typ):
function storel (line 1032) | def storel(aligned, simd_ext, typ):
function allany1 (line 1047) | def allany1(op, simd_ext, typ):
function nbtrue1 (line 1061) | def nbtrue1(simd_ext, typ):
function reinterpretl1 (line 1077) | def reinterpretl1(simd_ext, from_typ, to_typ):
function convert1 (line 1132) | def convert1(simd_ext, from_typ, to_typ):
function reinterpret1 (line 1177) | def reinterpret1(simd_ext, from_typ, to_typ):
function reverse1 (line 1277) | def reverse1(simd_ext, typ):
function addv (line 1308) | def addv(simd_ext, typ):
function add_sub_s (line 1322) | def add_sub_s(op, simd_ext, typ):
function upcvt1 (line 1339) | def upcvt1(simd_ext, from_typ, to_typ):
function downcvt1 (line 1424) | def downcvt1(simd_ext, from_typ, to_typ):
function unzip (line 1469) | def unzip(func, simd_ext, typ):
function zip (line 1503) | def zip(op, simd_ext, typ):
function zip_unzip_basic (line 1527) | def zip_unzip_basic(op, simd_ext, typ):
function to_mask (line 1536) | def to_mask(simd_ext, typ):
function iota (line 1557) | def iota(simd_ext, typ):
function mask_for_loop_tail (line 1582) | def mask_for_loop_tail(simd_ext, typ):
function scatter (line 1602) | def scatter(simd_ext, typ):
function gather (line 1632) | def gather(simd_ext, typ):
function gather_linear (line 1666) | def gather_linear(simd_ext, typ):
function scatter_linear (line 1693) | def scatter_linear(simd_ext, typ):
function maskoz_load (line 1713) | def maskoz_load(oz, simd_ext, typ):
function mask_store (line 1770) | def mask_store(simd_ext, typ):
function to_logical (line 1811) | def to_logical(simd_ext, typ):
function get_impl (line 1838) | def get_impl(opts, func, simd_ext, from_typ, to_typ):
FILE: egg/platform_x86.py
function get_simd_exts (line 41) | def get_simd_exts():
function get_prev_simd_ext (line 45) | def get_prev_simd_ext(simd_ext):
function emulate_fp16 (line 59) | def emulate_fp16(simd_ext):
function get_native_typ (line 65) | def get_native_typ(simd_ext, typ):
function get_type (line 83) | def get_type(opts, simd_ext, typ, nsimd_typ):
function get_logical_type (line 94) | def get_logical_type(opts, simd_ext, typ, nsimd_typ):
function get_nb_registers (line 109) | def get_nb_registers(simd_ext):
function has_compatible_SoA_types (line 118) | def has_compatible_SoA_types(simd_ext):
function get_additional_include (line 125) | def get_additional_include(func, platform, simd_ext):
function pre (line 296) | def pre(simd_ext):
function suf_ep (line 308) | def suf_ep(typ):
function nbits (line 320) | def nbits(simd_ext):
function suf_si (line 328) | def suf_si(simd_ext, typ):
function castsi (line 348) | def castsi(simd_ext, typ):
function extract (line 354) | def extract(simd_ext, typ, lohi, var):
function setr (line 391) | def setr(simd_ext, typ, var1, var2):
function set_lane (line 415) | def set_lane(simd_ext, typ, var_name, scalar, i):
function get_lane (line 496) | def get_lane(simd_ext, typ, var_name, i):
function get_undefined (line 554) | def get_undefined(simd_ext, typ):
function get_emulation_code (line 566) | def get_emulation_code(func, signature, simd_ext, typ):
function how_it_should_be_op2 (line 619) | def how_it_should_be_op2(func, simd_ext, typ):
function split_opn (line 629) | def split_opn(func, simd_ext, typ, n):
function split_op2 (line 650) | def split_op2(func, simd_ext, typ):
function emulate_op2 (line 653) | def emulate_op2(opts, op, simd_ext, typ):
function emulate_op1 (line 657) | def emulate_op1(opts, func, simd_ext, typ):
function split_cmp2 (line 660) | def split_cmp2(func, simd_ext, typ):
function f16_cmp2 (line 711) | def f16_cmp2(func, simd_ext):
function cmp2_with_add (line 717) | def cmp2_with_add(func, simd_ext, typ):
function load (line 733) | def load(simd_ext, typ, aligned):
function maskoz_load (line 802) | def maskoz_load(simd_ext, typ, oz, aligned):
function load_deg234 (line 916) | def load_deg234(simd_ext, typ, align, deg):
function store_deg234 (line 956) | def store_deg234(simd_ext, typ, align, deg):
function store (line 997) | def store(simd_ext, typ, aligned):
function mask_store (line 1057) | def mask_store(simd_ext, typ, aligned):
function binop2 (line 1141) | def binop2(func, simd_ext, typ, logical=False):
function binlop2 (line 1176) | def binlop2(func, simd_ext, typ):
function andnot2 (line 1220) | def andnot2(simd_ext, typ, logical=False):
function landnot2 (line 1252) | def landnot2(simd_ext, typ):
function not1 (line 1266) | def not1(simd_ext, typ, logical=False):
function lnot1 (line 1287) | def lnot1(simd_ext, typ):
function addsub (line 1301) | def addsub(func, simd_ext, typ):
function len1 (line 1314) | def len1(simd_ext, typ):
function div2 (line 1320) | def div2(opts, simd_ext, typ):
function mul2 (line 1328) | def mul2(opts, simd_ext, typ):
function shl_shr (line 1387) | def shl_shr(func, simd_ext, typ):
function shra (line 1455) | def shra(opts, simd_ext, typ):
function set1 (line 1512) | def set1(simd_ext, typ):
function set1l (line 1535) | def set1l(simd_ext, typ):
function eq2 (line 1564) | def eq2(simd_ext, typ):
function neq2 (line 1607) | def neq2(simd_ext, typ):
function gt2 (line 1634) | def gt2(simd_ext, typ):
function lt2 (line 1687) | def lt2(simd_ext, typ):
function geq2 (line 1694) | def geq2(simd_ext, typ):
function leq2 (line 1726) | def leq2(simd_ext, typ):
function if_else1 (line 1756) | def if_else1(simd_ext, typ):
function minmax (line 1808) | def minmax(func, simd_ext, typ):
function sqrt1 (line 1848) | def sqrt1(simd_ext, typ):
function loadl (line 1859) | def loadl(simd_ext, typ, aligned):
function storel (line 1892) | def storel(simd_ext, typ, aligned):
function abs1 (line 1929) | def abs1(simd_ext, typ):
function fma_fms (line 1999) | def fma_fms(func, simd_ext, typ):
function round1 (line 2042) | def round1(opts, func, simd_ext, typ):
function trunc1 (line 2061) | def trunc1(opts, simd_ext, typ):
function round_to_even1 (line 2089) | def round_to_even1(opts, simd_ext, typ):
function all_any (line 2111) | def all_any(func, simd_ext, typ):
function reinterpret1 (line 2157) | def reinterpret1(simd_ext, from_typ, to_typ):
function reinterpretl1 (line 2232) | def reinterpretl1(simd_ext, from_typ, to_typ):
function convert1 (line 2294) | def convert1(simd_ext, from_typ, to_typ):
function rec11_rsqrt11 (line 2382) | def rec11_rsqrt11(func, simd_ext, typ):
function rec1 (line 2411) | def rec1(simd_ext, typ):
function neg1 (line 2425) | def neg1(simd_ext, typ):
function nbtrue1 (line 2447) | def nbtrue1(simd_ext, typ):
function reverse1 (line 2487) | def reverse1(simd_ext, typ):
function addv (line 2680) | def addv(simd_ext, typ):
function upcvt1 (line 2776) | def upcvt1(simd_ext, from_typ, to_typ):
function downcvt1 (line 2943) | def downcvt1(opts, simd_ext, from_typ, to_typ):
function adds_subs_intrinsic_instructions_i8_i16_u8_u16 (line 3029) | def adds_subs_intrinsic_instructions_i8_i16_u8_u16(which_op, simd_ext, t...
function get_avx512_sse2_i32_i64_dependent_code (line 3050) | def get_avx512_sse2_i32_i64_dependent_code(simd_ext, typ):
function adds (line 3084) | def adds(simd_ext, typ):
function subs (line 3216) | def subs(simd_ext, typ):
function to_mask1 (line 3251) | def to_mask1(simd_ext, typ):
function to_logical1 (line 3293) | def to_logical1(simd_ext, typ):
function zip_half (line 3314) | def zip_half(func, simd_ext, typ):
function zip (line 3410) | def zip(simd_ext, typ):
function unzip_half (line 3420) | def unzip_half(opts, func, simd_ext, typ):
function unzip (line 3603) | def unzip(simd_ext, typ):
function mask_for_loop_tail (line 3612) | def mask_for_loop_tail(simd_ext, typ):
function iota (line 3634) | def iota(simd_ext, typ):
function scatter (line 3653) | def scatter(simd_ext, typ):
function scatter_linear (line 3682) | def scatter_linear(simd_ext, typ):
function mask_scatter (line 3724) | def mask_scatter(simd_ext, typ):
function gather (line 3783) | def gather(simd_ext, typ):
function gather_linear (line 3828) | def gather_linear(simd_ext, typ):
function maskoz_gather (line 3878) | def maskoz_gather(oz, simd_ext, typ):
function get_impl (line 3978) | def get_impl(opts, func, simd_ext, from_typ, to_typ):
FILE: egg/rocm.py
function get_impl (line 25) | def get_impl(operator, totyp, typ):
FILE: egg/scalar.py
function opnum (line 27) | def opnum(func, typ):
function cmp (line 45) | def cmp(func, typ):
function opbit (line 63) | def opbit(func, typ):
function shift (line 78) | def shift(func, typ):
function libm_opn (line 107) | def libm_opn(func, arity, typ, until_cpp11, c89_code):
function round_to_even (line 176) | def round_to_even(typ):
function reinterpret (line 219) | def reinterpret(totyp, typ):
function cvt (line 273) | def cvt(totyp, typ):
function adds (line 292) | def adds(typ):
function subs (line 326) | def subs(typ):
function get_impl (line 342) | def get_impl(operator, totyp, typ):
FILE: egg/x86_load_store_deg234.py
function perm64 (line 31) | def perm64(var1, var2, ind1, ind2):
function get_load_v0v1 (line 38) | def get_load_v0v1(simd_ext, typ, align, fmtspec):
function load2_sse (line 51) | def load2_sse(simd_ext, typ, align, fmtspec2):
function load2_avx (line 123) | def load2_avx(simd_ext, typ, align, fmtspec2):
function load2_avx512 (line 276) | def load2_avx512(simd_ext, typ, align, fmtspec2):
function store2 (line 368) | def store2(simd_ext, typ, align, fmtspec2):
function get_load_v0v1v2v3 (line 532) | def get_load_v0v1v2v3(simd_ext, typ, align, fmtspec):
function load4_sse (line 549) | def load4_sse(simd_ext, typ, align, fmtspec2):
function load4_avx (line 652) | def load4_avx(simd_ext, typ, align, fmtspec2):
function load4_avx512 (line 938) | def load4_avx512(simd_ext, typ, align, fmtspec2):
function store4 (line 1125) | def store4(simd_ext, typ, align, fmtspec2, v0, v1, v2, v3):
function store4_sse (line 1149) | def store4_sse(typ, align, fmtspec2):
function store4_avx (line 1204) | def store4_avx(simd_ext, typ, align, fmtspec2):
function store4_avx512 (line 1414) | def store4_avx512(simd_ext, typ, align, fmtspec2):
function get_load_v0v1v2 (line 1564) | def get_load_v0v1v2(simd_ext, typ, align, fmtspec):
function load3_sse (line 1579) | def load3_sse(simd_ext, typ, align, fmtspec2):
function store3 (line 1730) | def store3(simd_ext, typ, align, fmtspec2, v0, v1, v2):
function store3_sse (line 1751) | def store3_sse(simd_ext, typ, align, fmtspec2):
function load3_avx (line 1954) | def load3_avx(simd_ext, typ, align, fmtspec2):
function store3_avx (line 2409) | def store3_avx(simd_ext, typ, align, fmtspec2):
function load3_avx512 (line 2811) | def load3_avx512(simd_ext, typ, align, fmtspec2):
function store3_avx512 (line 3115) | def store3_avx512(simd_ext, typ, align, fmtspec2):
FILE: examples/module_fixed_point.cpp
function rand_float (line 26) | float rand_float() {
function main (line 30) | int main() {
FILE: examples/tutorial.cpp
function uppercase_scalar (line 8) | void uppercase_scalar(T *dst, const T *src, int n) {
function uppercase_simd (line 19) | void uppercase_simd(T *dst, const T *src, int n) {
function main (line 40) | int main(int argc, char **argv) {
FILE: include/nsimd/c_adv_api.h
function NSIMD_INLINE (line 32) | NSIMD_INLINE void nsimd_c11_type_unsupported(void) {}
FILE: include/nsimd/cxx_adv_api.hpp
type nsimd (line 33) | namespace nsimd {
type nsimd_static_assert (line 38) | struct nsimd_static_assert
type nsimd_static_assert<true> (line 39) | struct nsimd_static_assert<true> {}
function NSIMD_STRUCT (line 49) | NSIMD_STRUCT pack<T, 1, SimdExt> {
function NSIMD_STRUCT (line 101) | NSIMD_STRUCT pack {
type is_pack_t (line 138) | struct is_pack_t : public std::false_type {}
type is_pack_t<pack<T, N, SimdExt> > (line 141) | struct is_pack_t<pack<T, N, SimdExt> > : public std::true_type {}
function NSIMD_STRUCT (line 157) | NSIMD_STRUCT packl<T, 1, SimdExt> {
function NSIMD_STRUCT (line 196) | NSIMD_STRUCT packl {
type is_packl_t (line 220) | struct is_packl_t : public std::false_type {}
type is_packl_t<packl<T, N, SimdExt> > (line 223) | struct is_packl_t<packl<T, N, SimdExt> > : public std::true_type {}
function NSIMD_STRUCT (line 239) | NSIMD_STRUCT packx1<T, 1, SimdExt> {
function NSIMD_STRUCT (line 254) | NSIMD_STRUCT packx1 {
type is_packx1_t (line 273) | struct is_packx1_t : public std::false_type {}
type is_packx1_t<packx1<T, N, SimdExt> > (line 276) | struct is_packx1_t<packx1<T, N, SimdExt> > : public std::true_type {}
function NSIMD_STRUCT (line 292) | NSIMD_STRUCT packx2<T, 1, SimdExt> {
function NSIMD_STRUCT (line 309) | NSIMD_STRUCT packx2 {
type is_packx2_t (line 332) | struct is_packx2_t : public std::false_type {}
type is_packx2_t<packx2<T, N, SimdExt> > (line 335) | struct is_packx2_t<packx2<T, N, SimdExt> > : public std::true_type {}
function NSIMD_STRUCT (line 351) | NSIMD_STRUCT packx3<T, 1, SimdExt> {
function NSIMD_STRUCT (line 370) | NSIMD_STRUCT packx3 {
type is_packx3_t (line 397) | struct is_packx3_t : public std::false_type {}
type is_packx3_t<packx3<T, N, SimdExt> > (line 400) | struct is_packx3_t<packx3<T, N, SimdExt> > : public std::true_type {}
function NSIMD_STRUCT (line 416) | NSIMD_STRUCT packx4<T, 1, SimdExt> {
function NSIMD_STRUCT (line 438) | NSIMD_STRUCT packx4 {
type is_packx4_t (line 469) | struct is_packx4_t : public std::false_type {}
type is_packx4_t<packx4<T, N, SimdExt> > (line 472) | struct is_packx4_t<packx4<T, N, SimdExt> > : public std::true_type {}
function len (line 496) | int len(pack<T, N, SimdExt> const &) {
function len (line 501) | int len(packl<T, N, SimdExt> const &) {
function len (line 506) | int len(packx1<T, N, SimdExt> const &) {
function len (line 511) | int len(packx2<T, N, SimdExt> const &) {
function len (line 516) | int len(packx3<T, N, SimdExt> const &) {
function len (line 521) | int len(packx4<T, N, SimdExt> const &) {
function len (line 525) | int len() { return len(Pack()); }
function T (line 531) | T addv(pack<T, 1, SimdExt> const &a0) {
function T (line 536) | T addv(pack<T, N, SimdExt> const &a0) {
function all (line 544) | int all(packl<T, 1, SimdExt> const &a0) {
function all (line 549) | int all(packl<T, N, SimdExt> const &a0) {
function any (line 557) | int any(packl<T, 1, SimdExt> const &a0) {
function any (line 562) | int any(packl<T, N, SimdExt> const &a0) {
function nbtrue (line 570) | int nbtrue(packl<T, 1, SimdExt> const &a0) {
function nbtrue (line 575) | int nbtrue(packl<T, N, SimdExt> const &a0) {
function if_else (line 737) | pack<T, 1, SimdExt>
function if_else (line 748) | pack<T, N, SimdExt>
function mask_storea (line 763) | void mask_storea(packl<L, N, SimdExt> const &a0, T *a1,
function mask_storeu (line 771) | void mask_storeu(packl<L, N, SimdExt> const &a0, T *a1,
function maskz_loada (line 779) | pack<T, N, SimdExt> maskz_loada(packl<L, N, SimdExt> const &a0, const T ...
function maskz_loadu (line 786) | pack<T, N, SimdExt> maskz_loadu(packl<L, N, SimdExt> const &a0, const T ...
function masko_loada (line 793) | pack<T, N, SimdExt> masko_loada(packl<L, N, SimdExt> const &a0, const T ...
function masko_loadu (line 801) | pack<T, N, SimdExt> masko_loadu(packl<L, N, SimdExt> const &a0, const T ...
type loadz_return_t (line 811) | struct loadz_return_t {
type load_helper (line 817) | struct load_helper {}
type load_helper<SimdVector, aligned> (line 820) | struct load_helper<SimdVector, aligned> {
method SimdVector (line 825) | static SimdVector load(const T *a0) { return loada<SimdVector>(a0); }
method SimdVector (line 826) | static SimdVector loadl(const T *a0) { return loadla<SimdVector>(a0); }
method SimdVector (line 827) | static SimdVector load2(const T *a0) { return load2a<SimdVector>(a0); }
method SimdVector (line 828) | static SimdVector load3(const T *a0) { return load3a<SimdVector>(a0); }
method SimdVector (line 829) | static SimdVector load4(const T *a0) { return load4a<SimdVector>(a0); }
method SimdVector (line 831) | static SimdVector maskz_load(packl<T, N, simd_ext> const &a0, const T ...
method masko_load (line 835) | static pack<T, N, simd_ext> masko_load(packl<T, N, simd_ext> const &a0,
type load_helper<SimdVector, unaligned> (line 842) | struct load_helper<SimdVector, unaligned> {
method SimdVector (line 847) | static SimdVector load(const T *a0) { return loadu<SimdVector>(a0); }
method SimdVector (line 848) | static SimdVector loadl(const T *a0) { return loadlu<SimdVector>(a0); }
method SimdVector (line 849) | static SimdVector load2(const T *a0) { return load2u<SimdVector>(a0); }
method SimdVector (line 850) | static SimdVector load3(const T *a0) { return load3u<SimdVector>(a0); }
method SimdVector (line 851) | static SimdVector load4(const T *a0) { return load4u<SimdVector>(a0); }
method SimdVector (line 853) | static SimdVector maskz_load(packl<T, N, simd_ext> const &a0, const T ...
method masko_load (line 857) | static pack<T, N, simd_ext> masko_load(packl<T, N, simd_ext> const &a0,
type store_helper (line 864) | struct store_helper {}
function store (line 869) | static void store(NSIMD_T *a0, P const &a1) {
function storel (line 881) | static void storel(NSIMD_T *a0, P const &a1) {
function store2 (line 886) | static void store2(NSIMD_T *a0, P const &a1, P const &a2) {
function store3 (line 891) | static void store3(NSIMD_T *a0, P const &a1, P const &a2, P const &a3) {
function store4 (line 896) | static void store4(NSIMD_T *a0, P const &a1, P const &a2, P const &a3,
function store (line 903) | static void store(NSIMD_T *a0, P const &a1) {
function storel (line 915) | static void storel(NSIMD_T *a0, P const &a1) {
function store2 (line 920) | static void store2(NSIMD_T *a0, P const &a1, P const &a2) {
function store3 (line 925) | static void store3(NSIMD_T *a0, P const &a1, P const &a2, P const &a3) {
function store4 (line 930) | static void store4(NSIMD_T *a0, P const &a1, P const &a2, P const &a3,
function SimdVector (line 941) | SimdVector load(const typename SimdVector::value_type *ptr) {
function maskz_load (line 946) | pack<typename Packl::value_type, Packl::unroll, typename Packl::simd_ext>
function Pack (line 955) | Pack masko_load(Packl const &pl, const typename Pack::value_type *ptr,
function SimdVector (line 961) | SimdVector loadl(const typename SimdVector::value_type *ptr) {
function SimdVector (line 966) | SimdVector load2(const typename SimdVector::value_type *ptr) {
function SimdVector (line 971) | SimdVector load3(const typename SimdVector::value_type *ptr) {
function SimdVector (line 976) | SimdVector load4(const typename SimdVector::value_type *ptr) {
function store (line 981) | void store(typename Pack::value_type *ptr, Pack const &p) {
function mask_store (line 987) | void mask_store(Packl const &pl, typename Pack::value_type *ptr,
function storel (line 993) | void storel(typename Packl::value_type *ptr, Packl const &pl) {
function store2 (line 998) | void store2(typename Pack::value_type *ptr, Pack const &p1, Pack const &...
function store3 (line 1003) | void store3(typename Pack::value_type *ptr, Pack const &p1, Pack const &p2,
function store4 (line 1009) | void store4(typename Pack::value_type *ptr, Pack const &p1, Pack const &p2,
function T (line 1016) | T native_register(T a) { return a; }
function native_register (line 1019) | typename pack<T, 1, SimdExt>::simd_vector
type get_pack_helper (line 1029) | struct get_pack_helper {}
type get_pack_helper<T, N, SimdExt, packx1, Ix> (line 1036) | struct get_pack_helper<T, N, SimdExt, packx1, Ix> {}
type get_pack_helper<T, N, SimdExt, packx1, 0> (line 1039) | struct get_pack_helper<T, N, SimdExt, packx1, 0> {
type get_pack_helper<T, N, SimdExt, packx2, Ix> (line 1051) | struct get_pack_helper<T, N, SimdExt, packx2, Ix> {}
type get_pack_helper<T, N, SimdExt, packx2, 0> (line 1054) | struct get_pack_helper<T, N, SimdExt, packx2, 0> {
type get_pack_helper<T, N, SimdExt, packx2, 1> (line 1062) | struct get_pack_helper<T, N, SimdExt, packx2, 1> {
type get_pack_helper<T, N, SimdExt, packx3, Ix> (line 1074) | struct get_pack_helper<T, N, SimdExt, packx3, Ix> {}
type get_pack_helper<T, N, SimdExt, packx3, 0> (line 1077) | struct get_pack_helper<T, N, SimdExt, packx3, 0> {
type get_pack_helper<T, N, SimdExt, packx3, 1> (line 1085) | struct get_pack_helper<T, N, SimdExt, packx3, 1> {
type get_pack_helper<T, N, SimdExt, packx3, 2> (line 1093) | struct get_pack_helper<T, N, SimdExt, packx3, 2> {
type get_pack_helper<T, N, SimdExt, packx4, Ix> (line 1105) | struct get_pack_helper<T, N, SimdExt, packx4, Ix> {}
type get_pack_helper<T, N, SimdExt, packx4, 0> (line 1108) | struct get_pack_helper<T, N, SimdExt, packx4, 0> {
type get_pack_helper<T, N, SimdExt, packx4, 1> (line 1116) | struct get_pack_helper<T, N, SimdExt, packx4, 1> {
type get_pack_helper<T, N, SimdExt, packx4, 2> (line 1124) | struct get_pack_helper<T, N, SimdExt, packx4, 2> {
type get_pack_helper<T, N, SimdExt, packx4, 3> (line 1132) | struct get_pack_helper<T, N, SimdExt, packx4, 3> {
function get_pack (line 1145) | pack<T, N, SimdExt> get_pack(const pack<T, N, SimdExt> &pack_) {
function get_pack (line 1157) | pack<T, N, SimdExt> get_pack(const packx<T, N, SimdExt> &packx_) {
type to_pack_trait (line 1164) | struct to_pack_trait {}
type to_pack_trait<_packx<T, N, SimdExt> > (line 1168) | struct to_pack_trait<_packx<T, N, SimdExt> > {
function to_pack (line 1178) | pack<T, 1, SimdExt> to_pack(const pack<T, 1, SimdExt> &pack_) {
function to_pack (line 1183) | pack<T, N, SimdExt> to_pack(const pack<T, N, SimdExt> &pack_) {
function to_pack (line 1192) | pack<T, 1, SimdExt> to_pack(const packx1<T, 1, SimdExt> &packx_) {
function to_pack (line 1201) | pack<T, 2, SimdExt> to_pack(const packx2<T, 1, SimdExt> &packx_) {
function to_pack (line 1211) | pack<T, 3, SimdExt> to_pack(const packx3<T, 1, SimdExt> &packx_) {
function to_pack (line 1221) | pack<T, 4, SimdExt> to_pack(const packx4<T, 1, SimdExt> &packx_) {
type to_pack_recurs_helper (line 1240) | struct to_pack_recurs_helper {
method to_pack (line 1241) | static pack<T, to_pack_unroll_ix, SimdExt>
type to_pack_recurs_helper<T, from_pack_init_N, 1 /* from_pack_unroll_ix */,
1 /* to_pack_unroll_ix */, which_from_pack_ix,
SimdExt, packx> (line 1260) | struct to_pack_recurs_helper<T, from_pack_init_N, 1 /* from_pack_unroll_...
method to_pack (line 1263) | static pack<T, 1, SimdExt>
type to_pack_recurs_helper<T, from_pack_init_N, 1 /* from_pack_unroll_ix */,
to_pack_unroll_ix, which_from_pack_ix, SimdExt,
packx> (line 1278) | struct to_pack_recurs_helper<T, from_pack_init_N, 1 /* from_pack_unroll_...
method to_pack (line 1281) | static pack<T, to_pack_unroll_ix, SimdExt>
function to_pack (line 1300) | typename to_pack_trait<packx<T, N, SimdExt> >::value_type
function to_pack_interleave (line 1317) | pack<T, 1, SimdExt> to_pack_interleave(const pack<T, 1, SimdExt> &pack_) {
function to_pack_interleave (line 1322) | pack<T, N, SimdExt> to_pack_interleave(const pack<T, N, SimdExt> &pack_) {
function to_pack_interleave (line 1329) | pack<T, 1, SimdExt> to_pack_interleave(const packx1<T, 1, SimdExt> &pack...
function to_pack_interleave (line 1337) | pack<T, N, SimdExt>
function to_pack_interleave (line 1348) | pack<T, 2, SimdExt> to_pack_interleave(const packx2<T, 1, SimdExt> &pack...
function to_pack_interleave (line 1358) | pack<T, 2 * N, SimdExt>
function to_pack_interleave (line 1377) | pack<T, 3, SimdExt> to_pack_interleave(const packx3<T, 1, SimdExt> &pack...
function to_pack_interleave (line 1388) | pack<T, 3 * N, SimdExt>
function to_pack_interleave (line 1409) | pack<T, 4, SimdExt> to_pack_interleave(const packx4<T, 1, SimdExt> &pack...
function to_pack_interleave (line 1421) | pack<T, 4 * N, SimdExt>
FILE: include/nsimd/cxx_adv_api_aliases.hpp
type nsimd (line 30) | namespace nsimd {
function fabs (line 35) | pack<T, N, SimdExt> fabs(pack<T, N, SimdExt> const &a0) {
function fmin (line 42) | pack<T, N, SimdExt> fmin(pack<T, N, SimdExt> const &a0,
function fmax (line 50) | pack<T, N, SimdExt> fmax(pack<T, N, SimdExt> const &a0,
FILE: include/nsimd/modules/fixed_point.hpp
type nsimd (line 34) | namespace nsimd {
type fixed_point (line 35) | namespace fixed_point {
function len (line 43) | int len(const T &) { return fpsimd_n(T()); }
function len (line 45) | int len(const nsimd::fixed_point::pack<T> &) {
function NSIMD_STRUCT (line 49) | NSIMD_STRUCT pack {
function NSIMD_STRUCT (line 72) | NSIMD_STRUCT packl {
function NSIMD_INLINE (line 84) | NSIMD_INLINE pack<T> add(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 91) | NSIMD_INLINE pack<T> operator+(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 96) | NSIMD_INLINE pack<T> sub(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 103) | NSIMD_INLINE pack<T> operator-(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 108) | NSIMD_INLINE pack<T> mul(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 115) | NSIMD_INLINE pack<T> operator*(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 120) | NSIMD_INLINE pack<T> div(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 127) | NSIMD_INLINE pack<T> operator/(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 132) | NSIMD_INLINE pack<T> fma(const pack<T> &a0, const pack<T> &a1,
function NSIMD_INLINE (line 140) | NSIMD_INLINE pack<T> min(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 147) | NSIMD_INLINE pack<T> max(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 158) | NSIMD_INLINE packl<T> eq(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 165) | NSIMD_INLINE pack<T> operator==(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 170) | NSIMD_INLINE packl<T> ne(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 177) | NSIMD_INLINE pack<T> operator!=(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 182) | NSIMD_INLINE packl<T> le(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 189) | NSIMD_INLINE pack<T> operator<=(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 194) | NSIMD_INLINE packl<T> lt(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 201) | NSIMD_INLINE pack<T> operator<(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 206) | NSIMD_INLINE packl<T> ge(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 213) | NSIMD_INLINE pack<T> operator>=(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 218) | NSIMD_INLINE packl<T> gt(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 225) | NSIMD_INLINE pack<T> operator>(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 230) | NSIMD_INLINE pack<T> if_else1(const packl<T> &a0, const pack<T> &a1,
function NSIMD_INLINE (line 242) | NSIMD_INLINE pack<T> andb(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 249) | NSIMD_INLINE packl<T> andl(const packl<T> &a0, const packl<T> &a1) {
function NSIMD_INLINE (line 256) | NSIMD_INLINE pack<T> andnotb(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 263) | NSIMD_INLINE packl<T> andnotl(const packl<T> &a0, const packl<T> &a1) {
function NSIMD_INLINE (line 269) | NSIMD_INLINE pack<T> notb(pack<T> a0) {
function NSIMD_INLINE (line 275) | NSIMD_INLINE packl<T> notl(packl<T> a0) {
function NSIMD_INLINE (line 282) | NSIMD_INLINE pack<T> orb(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 289) | NSIMD_INLINE packl<T> orl(const packl<T> &a0, const packl<T> &a1) {
function NSIMD_INLINE (line 296) | NSIMD_INLINE pack<T> xorb(const pack<T> &a0, const pack<T> &a1) {
function NSIMD_INLINE (line 303) | NSIMD_INLINE packl<T> xorl(const packl<T> &a0, const packl<T> &a1) {
function NSIMD_INLINE (line 313) | NSIMD_INLINE pack<T> abs(pack<T> a0) {
function NSIMD_INLINE (line 319) | NSIMD_INLINE pack<T> rec(pack<T> a0) {
function NSIMD_INLINE (line 329) | NSIMD_INLINE T set1(typename T::value_type a0) {
function NSIMD_INLINE (line 335) | NSIMD_INLINE T loadu(typename T::value_type *p) {
function NSIMD_INLINE (line 341) | NSIMD_INLINE T loada(typename T::value_type *p) {
function NSIMD_INLINE (line 347) | NSIMD_INLINE T loadlu(typename T::value_type *p) {
function NSIMD_INLINE (line 353) | NSIMD_INLINE T loadla(typename T::value_type *p) {
function NSIMD_INLINE (line 364) | NSIMD_INLINE void storeu(typename T::value_type *p, T v) {
function NSIMD_INLINE (line 369) | NSIMD_INLINE void storea(typename T::value_type *p, T v) {
function NSIMD_INLINE (line 374) | NSIMD_INLINE void storelu(typename T::value_type *p, T v) {
function NSIMD_INLINE (line 379) | NSIMD_INLINE void storela(typename T::value_type *p, T v) {
FILE: include/nsimd/modules/memory_management.hpp
type nsimd (line 33) | namespace nsimd {
function T (line 40) | T *device_malloc(size_t sz) {
function T (line 48) | T *device_calloc(size_t sz) {
function device_free (line 60) | void device_free(T *ptr) { cudaFree((void *)ptr); }
function copy_to_device (line 63) | void copy_to_device(T *device_ptr, T *host_ptr, size_t sz) {
function copy_to_host (line 69) | void copy_to_host(T *host_ptr, T *device_ptr, size_t sz) {
function T (line 93) | T *device_malloc(size_t sz) {
function T (line 101) | T *device_calloc(size_t sz) {
function device_free (line 113) | void device_free(T *ptr) { hipFree((void *)ptr); }
function copy_to_device (line 116) | void copy_to_device(T *device_ptr, T *host_ptr, size_t sz) {
function copy_to_host (line 121) | void copy_to_host(T *host_ptr, T *device_ptr, size_t sz) {
function T (line 146) | T *device_malloc(const size_t sz) {
function T (line 150) | T *device_calloc(const size_t sz) {
function device_free (line 160) | void device_free(T *const ptr) {
function copy_to_device (line 166) | void copy_to_device(T *const device_ptr, const T *const host_ptr,
function copy_to_host (line 174) | void copy_to_host(T *const host_ptr, const T *const device_ptr, size_t...
function T (line 207) | T *device_malloc(size_t sz) {
function T (line 211) | T *device_calloc(size_t sz) {
function device_free (line 215) | void device_free(T *ptr) { free((void *)ptr); }
function copy_to_device (line 218) | void copy_to_device(T *device_ptr, T *host_ptr, size_t sz) {
function copy_to_host (line 223) | void copy_to_host(T *host_ptr, T *device_ptr, size_t sz) {
type paired_pointers_t (line 240) | struct paired_pointers_t {
function pair_malloc (line 245) | paired_pointers_t<T> pair_malloc(size_t sz) {
function pair_malloc_or_exit (line 268) | paired_pointers_t<T> pair_malloc_or_exit(size_t sz) {
function pair_calloc (line 278) | paired_pointers_t<T> pair_calloc(size_t sz) {
function pair_calloc_or_exit (line 301) | paired_pointers_t<T> pair_calloc_or_exit(size_t sz) {
function pair_free (line 311) | void pair_free(paired_pointers_t<T> p) {
function copy_to_device (line 320) | void copy_to_device(paired_pointers_t<T> p) {
function copy_to_host (line 328) | void copy_to_host(paired_pointers_t<T> p) {
FILE: include/nsimd/modules/spmd.hpp
type spmd (line 34) | namespace spmd {
type type_t (line 184) | struct type_t {}
type type_t<8> (line 187) | struct type_t<8> {
type type_t<16> (line 193) | struct type_t<16> {
type type_t<32> (line 200) | struct type_t<32> {
type type_t<64> (line 207) | struct type_t<64> {
function to_pack (line 282) | nsimd::pack<T, N> to_pack(T a) {
function to_pack (line 287) | nsimd::pack<T, N, SimdExt> to_pack(nsimd::pack<T, N, SimdExt> const &a) {
function to_packl (line 291) | nsimd::packl<T, N> to_packl(bool a) {
function to_packl (line 296) | nsimd::packl<T, N> to_packl(Pack const &a) {
type base_type (line 300) | struct base_type { typedef T type; }
type base_type<nsimd::pack<T, N, SimdExt> > (line 303) | struct base_type<nsimd::pack<T, N, SimdExt> > {
type base_type<nsimd::packl<T, N, SimdExt> > (line 308) | struct base_type<nsimd::packl<T, N, SimdExt> > {
type KernelScalar (line 313) | struct KernelScalar {}
type KernelSIMD (line 314) | struct KernelSIMD {}
type type_t (line 410) | struct type_t {}
type type_t<KernelScalar, 8, N> (line 413) | struct type_t<KernelScalar, 8, N> {
type type_t<KernelScalar, 16, N> (line 419) | struct type_t<KernelScalar, 16, N> {
type type_t<KernelScalar, 32, N> (line 426) | struct type_t<KernelScalar, 32, N> {
type type_t<KernelScalar, 64, N> (line 433) | struct type_t<KernelScalar, 64, N> {
type type_t<KernelSIMD, 8, N> (line 441) | struct type_t<KernelSIMD, 8, N> {
type type_t<KernelSIMD, 16, N> (line 447) | struct type_t<KernelSIMD, 16, N> {
type type_t<KernelSIMD, 32, N> (line 454) | struct type_t<KernelSIMD, 32, N> {
type type_t<KernelSIMD, 64, N> (line 461) | struct type_t<KernelSIMD, 64, N> {
type store_helper (line 479) | struct store_helper {}
type load_helper (line 480) | struct load_helper {}
type store_helper<KernelScalar> (line 495) | struct store_helper<KernelScalar> {
method impl (line 497) | static void impl(bool mask, T *addr, S value) {
method unmasked_impl (line 504) | static void unmasked_impl(T *addr, S value) {
type load_helper<KernelScalar> (line 509) | struct load_helper<KernelScalar> {
method T (line 510) | static T impl(bool mask, T *addr) {
method T (line 518) | static T unmasked_impl(T *addr) {
type store_helper<KernelSIMD> (line 523) | struct store_helper<KernelSIMD> {
method impl (line 525) | static void impl(nsimd::packl<T, N, SimdExt> const &mask, S *addr,
method impl (line 531) | static void impl(nsimd::packl<T, N, SimdExt> const &mask, S *addr,
method unmasked_impl (line 538) | static void unmasked_impl(T *addr, nsimd::pack<T, N, SimdExt> const ...
method unmasked_impl (line 543) | static void unmasked_impl(T *addr, S value) {
type load_helper<KernelSIMD> (line 548) | struct load_helper<KernelSIMD> {
method impl (line 550) | static nsimd::pack<S, N, SimdExt>
method unmasked_impl (line 556) | static nsimd::pack<T, N> unmasked_impl(T *addr) {
function clear_lanes (line 567) | nsimd::packl<T, N, SimdExt>
function clear_lanes (line 573) | inline bool clear_lanes(bool mask, bool lanes) { return lanes ? false ...
function k_set_ (line 576) | void k_set_(bool mask, T &var, S value) {
function k_set_ (line 583) | void k_set_(nsimd::packl<T, N, SimdExt> const &mask,
function k_set_ (line 589) | void k_set_(nsimd::packl<T, N, SimdExt> const &mask,
function k_set_ (line 596) | void k_set_(nsimd::packl<T, N, SimdExt> const &mask,
function k_set_ (line 603) | void k_set_(nsimd::packl<T, N, SimdExt> const &mask,
function any (line 616) | bool any(nsimd::packl<T, N, SimdExt> const a) {
function to_k_bool_ (line 621) | typename type_t<KernelType, ScalarBits, N>::btype to_k_bool_(Packl con...
function to_k_bool_ (line 627) | inline bool to_k_bool_(bool a) {
function any (line 634) | inline bool any(bool a) { return a; }
FILE: include/nsimd/modules/tet1d.hpp
type tet1d (line 34) | namespace tet1d {
type none_t (line 39) | struct none_t {}
type node (line 42) | struct node {}
function gpuCheck (line 51) | inline void gpuCheck(cudaError_t code, const char *file, int line) {
function __global__ (line 67) | __global__ void gpu_kernel_component_wise(T *dst, Expr const expr,
function __global__ (line 77) | __global__ void gpu_kernel_component_wise_mask(T *dst, Mask const mask,
function __global__ (line 90) | __global__ void gpu_kernel_component_wise(T *dst, Expr const expr,
function __global__ (line 100) | __global__ void gpu_kernel_component_wise_mask(T *dst, Mask const mask,
function oneapi_kernel_component_wise (line 113) | void oneapi_kernel_component_wise(T *dst, Expr const expr,
function oneapi_kernel_component_wise_mask (line 123) | void oneapi_kernel_component_wise_mask(T *dst, Mask const mask,
function cpu_kernel_component_wise (line 138) | void cpu_kernel_component_wise(T *dst, Expr const &expr, nsimd::nat n) {
function cpu_kernel_component_wise_mask (line 151) | void cpu_kernel_component_wise_mask(T *dst, Mask const &mask, Expr con...
function compute_size (line 172) | nsimd::nat compute_size(nsimd::nat sz1, nsimd::nat sz2) {
function compute_size (line 182) | nsimd::nat compute_size(nsimd::nat sz1, nsimd::nat sz2, nsimd::nat sz3) {
type to_pack_t (line 189) | struct to_pack_t {
type to_pack_t<nsimd::pack<T, Unroll, SimdExt>, Pack> (line 196) | struct to_pack_t<nsimd::pack<T, Unroll, SimdExt>, Pack> {
type to_packl_t (line 202) | struct to_packl_t {
type to_packl_t<nsimd::pack<T, Unroll, SimdExt>, Pack> (line 209) | struct to_packl_t<nsimd::pack<T, Unroll, SimdExt>, Pack> {
type scalar_t (line 218) | struct scalar_t {}
type node<scalar_t, none_t, none_t, T> (line 220) | struct node<scalar_t, none_t, none_t, T> {
method __device__ (line 226) | __device__ T gpu_get(nsimd::nat) const { return value; }
method T (line 228) | T gpu_get(nsimd::nat) const { return value; }
method T (line 230) | T scalar_get(nsimd::nat) const { return value; }
method simd_get (line 232) | typename to_pack_t<T, Pack>::type simd_get(nsimd::nat) const {
method size (line 238) | nsimd::nat size() const { return -1; }
type to_node_t (line 244) | struct to_node_t {
method type (line 247) | static type impl(T n) {
type to_node_t<node<Op, Left, Right, Extra> > (line 255) | struct to_node_t<node<Op, Left, Right, Extra> > {
method type (line 257) | static type impl(type node) { return node; }
function to_node (line 260) | typename to_node_t<T>::type to_node(T n) {
type literal_to (line 267) | struct literal_to {
method T (line 268) | static T impl(S a) { return T(a); }
type literal_to<f16> (line 271) | struct literal_to<f16> {
method f16 (line 272) | static f16 impl(S a) {
type in_t (line 280) | struct in_t {}
type node<in_t, none_t, none_t, T> (line 284) | struct node<in_t, none_t, none_t, T> {
method __device__ (line 291) | __device__ T gpu_get(nsimd::nat i) const { return data[i]; }
method T (line 293) | T gpu_get(nsimd::nat i) const { return data[i]; }
method T (line 295) | T scalar_get(nsimd::nat i) const { return data[i]; }
method simd_get (line 297) | typename to_pack_t<T, Pack>::type simd_get(nsimd::nat i) const {
method size (line 303) | nsimd::nat size() const { return sz; }
function in (line 321) | inline node<in_t, none_t, none_t, T> in(const T *data, I sz) {
type mask_out_t (line 331) | struct mask_out_t {}
type node<mask_out_t, Mask, none_t, Pack> (line 334) | struct node<mask_out_t, Mask, none_t, Pack> {
type out_t (line 389) | struct out_t {}
type node<out_t, none_t, none_t, Pack> (line 398) | struct node<out_t, none_t, none_t, Pack> {
function out (line 455) | node<out_t, none_t, none_t, nsimd::pack<T> > out(T *data) {
function out (line 464) | node<out_t, none_t, none_t, Pack> out(T *data, int threads_per_block,
FILE: include/nsimd/nsimd.h
type nsimd_longlong (line 131) | typedef long long nsimd_longlong;
type nsimd_ulonglong (line 132) | typedef unsigned long long nsimd_ulonglong;
function namespace (line 135) | namespace nsimd {
type __UINT64_TYPE__ (line 142) | typedef __UINT64_TYPE__ nsimd_uint64_type;
type __INT64_TYPE__ (line 146) | typedef __INT64_TYPE__ nsimd_int64_type;
function namespace (line 387) | namespace nsimd {
function namespace (line 413) | namespace nsimd {
function namespace (line 438) | namespace nsimd {
function namespace (line 465) | namespace nsimd {
function namespace (line 490) | namespace nsimd {
function namespace (line 517) | namespace nsimd {
function namespace (line 544) | namespace nsimd {
function namespace (line 563) | namespace nsimd {
function namespace (line 583) | namespace nsimd {
function namespace (line 605) | namespace nsimd {
function namespace (line 627) | namespace nsimd {
function namespace (line 649) | namespace nsimd {
function namespace (line 671) | namespace nsimd {
function namespace (line 693) | namespace nsimd {
function namespace (line 732) | namespace nsimd {
function namespace (line 769) | namespace nsimd {
function namespace (line 815) | namespace nsimd {
function namespace (line 834) | namespace nsimd {
type sycl (line 871) | typedef sycl::cl_char i8;
type sycl (line 872) | typedef sycl::cl_uchar u8;
type sycl (line 873) | typedef sycl::cl_short i16;
type sycl (line 874) | typedef sycl::cl_ushort u16;
type sycl (line 875) | typedef sycl::cl_int i32;
type sycl (line 876) | typedef sycl::cl_uint u32;
type sycl (line 877) | typedef sycl::cl_long i64;
type sycl (line 878) | typedef sycl::cl_ulong u64;
type u8 (line 880) | typedef unsigned __int8 u8;
type i8 (line 881) | typedef signed __int8 i8;
type u16 (line 882) | typedef unsigned __int16 u16;
type i16 (line 883) | typedef signed __int16 i16;
type u32 (line 884) | typedef unsigned __int32 u32;
type i32 (line 885) | typedef signed __int32 i32;
type u64 (line 886) | typedef unsigned __int64 u64;
type i64 (line 887) | typedef signed __int64 i64;
type u8 (line 889) | typedef unsigned char u8;
type i8 (line 890) | typedef signed char i8;
type u16 (line 891) | typedef unsigned short u16;
type i16 (line 892) | typedef signed short i16;
type __UINT32_TYPE__ (line 894) | typedef __UINT32_TYPE__ u32;
type u32 (line 897) | typedef unsigned long u32;
type u32 (line 899) | typedef unsigned int u32;
type __INT32_TYPE__ (line 903) | typedef __INT32_TYPE__ i32;
type i32 (line 906) | typedef signed long i32;
type i32 (line 908) | typedef signed int i32;
type nsimd_ulonglong (line 912) | typedef nsimd_ulonglong u64;
type nsimd_longlong (line 913) | typedef nsimd_longlong i64;
type nsimd_uint64_type (line 916) | typedef nsimd_uint64_type u64;
type u64 (line 918) | typedef unsigned long u64;
type nsimd_int64_type (line 921) | typedef nsimd_int64_type i64;
type i64 (line 923) | typedef signed long i64;
type nsimd_ulonglong (line 927) | typedef nsimd_ulonglong u64;
type nsimd_longlong (line 928) | typedef nsimd_longlong i64;
type u64 (line 930) | typedef unsigned long long u64;
type i64 (line 931) | typedef signed long long i64;
type __fp16 (line 977) | typedef __fp16 f16;
type __half (line 980) | typedef __half f16;
type sycl (line 983) | typedef sycl::half f16;
type f16 (line 986) | typedef struct { u16 u; } f16;
type sycl (line 990) | typedef sycl::cl_float f32;
type sycl (line 991) | typedef sycl::cl_double f64;
type f32 (line 993) | typedef float f32;
type f64 (line 994) | typedef double f64;
type i64 (line 1001) | typedef i64 nsimd_nat;
type i32 (line 1003) | typedef i32 nsimd_nat;
function namespace (line 1007) | namespace nsimd {
function namespace (line 1017) | namespace nsimd {
function NSIMD_INLINE (line 1141) | NSIMD_INLINE int nsimd_popcnt32_(u32 a) {
function NSIMD_INLINE (line 1155) | NSIMD_INLINE int nsimd_popcnt64_(u64 a) {
function namespace (line 1205) | namespace nsimd {
type vi8 (line 1236) | typedef vec(i8) vi8;
type vu8 (line 1237) | typedef vec(u8) vu8;
type vi16 (line 1238) | typedef vec(i16) vi16;
type vu16 (line 1239) | typedef vec(u16) vu16;
type vi32 (line 1240) | typedef vec(i32) vi32;
type vu32 (line 1241) | typedef vec(u32) vu32;
type vi64 (line 1242) | typedef vec(i64) vi64;
type vu64 (line 1243) | typedef vec(u64) vu64;
type vf16 (line 1244) | typedef vec(f16) vf16;
type vf32 (line 1245) | typedef vec(f32) vf32;
type vf64 (line 1246) | typedef vec(f64) vf64;
type vi8x2 (line 1248) | typedef vecx2(i8) vi8x2;
type vu8x2 (line 1249) | typedef vecx2(u8) vu8x2;
type vi16x2 (line 1250) | typedef vecx2(i16) vi16x2;
type vu16x2 (line 1251) | typedef vecx2(u16) vu16x2;
type vi32x2 (line 1252) | typedef vecx2(i32) vi32x2;
type vu32x2 (line 1253) | typedef vecx2(u32) vu32x2;
type vi64x2 (line 1254) | typedef vecx2(i64) vi64x2;
type vu64x2 (line 1255) | typedef vecx2(u64) vu64x2;
type vf16x2 (line 1256) | typedef vecx2(f16) vf16x2;
type vf32x2 (line 1257) | typedef vecx2(f32) vf32x2;
type vf64x2 (line 1258) | typedef vecx2(f64) vf64x2;
type vi8x3 (line 1260) | typedef vecx3(i8) vi8x3;
type vu8x3 (line 1261) | typedef vecx3(u8) vu8x3;
type vi16x3 (line 1262) | typedef vecx3(i16) vi16x3;
type vu16x3 (line 1263) | typedef vecx3(u16) vu16x3;
type vi32x3 (line 1264) | typedef vecx3(i32) vi32x3;
type vu32x3 (line 1265) | typedef vecx3(u32) vu32x3;
type vi64x3 (line 1266) | typedef vecx3(i64) vi64x3;
type vu64x3 (line 1267) | typedef vecx3(u64) vu64x3;
type vf16x3 (line 1268) | typedef vecx3(f16) vf16x3;
type vf32x3 (line 1269) | typedef vecx3(f32) vf32x3;
type vf64x3 (line 1270) | typedef vecx3(f64) vf64x3;
type vi8x4 (line 1272) | typedef vecx4(i8) vi8x4;
type vu8x4 (line 1273) | typedef vecx4(u8) vu8x4;
type vi16x4 (line 1274) | typedef vecx4(i16) vi16x4;
type vu16x4 (line 1275) | typedef vecx4(u16) vu16x4;
type vi32x4 (line 1276) | typedef vecx4(i32) vi32x4;
type vu32x4 (line 1277) | typedef vecx4(u32) vu32x4;
type vi64x4 (line 1278) | typedef vecx4(i64) vi64x4;
type vu64x4 (line 1279) | typedef vecx4(u64) vu64x4;
type vf16x4 (line 1280) | typedef vecx4(f16) vf16x4;
type vf32x4 (line 1281) | typedef vecx4(f32) vf32x4;
type vf64x4 (line 1282) | typedef vecx4(f64) vf64x4;
type vli8 (line 1284) | typedef vecl(i8) vli8;
type vlu8 (line 1285) | typedef vecl(u8) vlu8;
type vli16 (line 1286) | typedef vecl(i16) vli16;
type vlu16 (line 1287) | typedef vecl(u16) vlu16;
type vli32 (line 1288) | typedef vecl(i32) vli32;
type vlu32 (line 1289) | typedef vecl(u32) vlu32;
type vli64 (line 1290) | typedef vecl(i64) vli64;
type vlu64 (line 1291) | typedef vecl(u64) vlu64;
type vlf16 (line 1292) | typedef vecl(f16) vlf16;
type vlf32 (line 1293) | typedef vecl(f32) vlf32;
type vlf64 (line 1294) | typedef vecl(f64) vlf64;
function namespace (line 1301) | namespace nsimd {
function namespace (line 1366) | namespace nsimd {
function namespace (line 1442) | namespace nsimd {
function namespace (line 1467) | namespace nsimd {
function requires (line 1563) | requires std::integral<I>
function T (line 1569) | const T *get() const { return &data[0]; }
function T (line 1570) | T *get() { return &data[0]; }
function NSIMD_INLINE (line 1593) | NSIMD_INLINE f16 nsimd_f32_to_f16(f32 a) { return (f16)a; }
function NSIMD_INLINE (line 1594) | NSIMD_INLINE f32 nsimd_f16_to_f32(f16 a) { return (f32)a; }
function f16 (line 1597) | inline f16 nsimd_f32_to_f16(f32 a) { return __float2half(a); }
function f32 (line 1598) | inline f32 nsimd_f16_to_f32(f16 a) { return __half2float(a); }
function f16 (line 1600) | inline f16 nsimd_f32_to_f16(f32 a) {
function f32 (line 1604) | inline f32 nsimd_f16_to_f32(f16 a) { return nsimd_u16_to_f32(*(u16 *)&a); }
function f16 (line 1606) | inline f16 nsimd_f32_to_f16(f32 a) { return static_cast<sycl::half>(a); }
function f32 (line 1607) | inline f32 nsimd_f16_to_f32(f16 a) { return static_cast<float>(a); }
function namespace (line 1621) | namespace nsimd {
function NSIMD_INLINE (line 1637) | NSIMD_INLINE u64 nsimd_to_biggest_u8(u8 a) { return (u64)a; }
function NSIMD_INLINE (line 1638) | NSIMD_INLINE u64 nsimd_to_biggest_u16(u16 a) { return (u64)a; }
function NSIMD_INLINE (line 1639) | NSIMD_INLINE u64 nsimd_to_biggest_u32(u32 a) { return (u64)a; }
function NSIMD_INLINE (line 1640) | NSIMD_INLINE u64 nsimd_to_biggest_u64(u64 a) { return a; }
function NSIMD_INLINE (line 1641) | NSIMD_INLINE i64 nsimd_to_biggest_i8(i8 a) { return (i64)a; }
function NSIMD_INLINE (line 1642) | NSIMD_INLINE i64 nsimd_to_biggest_i16(i16 a) { return (i64)a; }
function NSIMD_INLINE (line 1643) | NSIMD_INLINE i64 nsimd_to_biggest_i32(i32 a) { return (i64)a; }
function NSIMD_INLINE (line 1644) | NSIMD_INLINE i64 nsimd_to_biggest_i64(i64 a) { return a; }
function NSIMD_INLINE (line 1645) | NSIMD_INLINE f64 nsimd_to_biggest_f16(f16 a) {
function NSIMD_INLINE (line 1648) | NSIMD_INLINE f64 nsimd_to_biggest_f32(f32 a) { return (f64)a; }
function NSIMD_INLINE (line 1649) | NSIMD_INLINE f64 nsimd_to_biggest_f64(f64 a) { return a; }
function namespace (line 1652) | namespace nsimd {
function namespace (line 1671) | namespace nsimd {
function namespace (line 1761) | namespace nsimd {
function namespace (line 1855) | namespace nsimd {
function NSIMD_INLINE (line 2115) | NSIMD_INLINE int nsimd_isnan_f16(f16 a) {
function NSIMD_INLINE (line 2125) | NSIMD_INLINE int nsimd_isnan_f32(f32 a) {
function NSIMD_INLINE (line 2135) | NSIMD_INLINE int nsimd_isnan_f64(f64 a) {
function NSIMD_INLINE (line 2145) | NSIMD_INLINE int nsimd_isinf_f16(f16 a) {
function NSIMD_INLINE (line 2155) | NSIMD_INLINE int nsimd_isinf_f32(f32 a) {
function NSIMD_INLINE (line 2165) | NSIMD_INLINE int nsimd_isinf_f64(f64 a) {
function NSIMD_INLINE (line 2175) | NSIMD_INLINE int nsimd_isnormal_f16(f16 a) {
function NSIMD_INLINE (line 2185) | NSIMD_INLINE int nsimd_isnormal_f32(f32 a) {
function NSIMD_INLINE (line 2195) | NSIMD_INLINE int nsimd_isnormal_f64(f64 a) {
function namespace (line 2206) | namespace nsimd {
function namespace (line 2235) | namespace nsimd {
FILE: scripts/one-liner.c
function main (line 55) | int main(int argc, char **argv) {
FILE: src/dd.h
type vdouble2 (line 7) | typedef struct {
function vdouble (line 11) | static vdouble vd2getx_vd_vd2(vdouble2 v) { return v.x; }
function vdouble (line 12) | static vdouble vd2gety_vd_vd2(vdouble2 v) { return v.y; }
function vdouble2 (line 13) | static vdouble2 vd2setxy_vd2_vd_vd(vdouble x, vdouble y) { vdouble2 v; ...
function vdouble2 (line 14) | static vdouble2 vd2setx_vd2_vd2_vd(vdouble2 v, vdouble d) { v.x = d; ret...
function vdouble2 (line 15) | static vdouble2 vd2sety_vd2_vd2_vd(vdouble2 v, vdouble d) { v.y = d; ret...
function VECTOR_CC (line 18) | VECTOR_CC vdouble vupper_vd_vd(vdouble d) {
function VECTOR_CC (line 22) | VECTOR_CC vdouble2 vcast_vd2_vd_vd(vdouble h, vdouble l) {
function VECTOR_CC (line 26) | VECTOR_CC vdouble2 vcast_vd2_d_d(double h, double l) {
function VECTOR_CC (line 30) | VECTOR_CC vdouble2 vsel_vd2_vo_vd2_vd2(vopmask m, vdouble2 x, vdouble2 y) {
function VECTOR_CC (line 35) | VECTOR_CC vdouble2 vsel_vd2_vo_d_d_d_d(vopmask o, double x1, double y1, ...
function VECTOR_CC (line 40) | VECTOR_CC vdouble vadd_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {
function VECTOR_CC (line 44) | VECTOR_CC vdouble vadd_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdoubl...
function VECTOR_CC (line 48) | VECTOR_CC vdouble vadd_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdoubl...
function VECTOR_CC (line 52) | VECTOR_CC vdouble vadd_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdoubl...
function VECTOR_CC (line 56) | VECTOR_CC vdouble vadd_vd_7vd(vdouble v0, vdouble v1, vdouble v2, vdoubl...
function VECTOR_CC (line 60) | VECTOR_CC vdouble vsub_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {
function VECTOR_CC (line 64) | VECTOR_CC vdouble vsub_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdoubl...
function VECTOR_CC (line 68) | VECTOR_CC vdouble vsub_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdoubl...
function VECTOR_CC (line 72) | VECTOR_CC vdouble vsub_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdoubl...
function VECTOR_CC (line 78) | VECTOR_CC vdouble2 ddneg_vd2_vd2(vdouble2 x) {
function VECTOR_CC (line 82) | VECTOR_CC vdouble2 ddabs_vd2_vd2(vdouble2 x) {
function VECTOR_CC (line 89) | VECTOR_CC vdouble2 ddnormalize_vd2_vd2(vdouble2 t) {
function VECTOR_CC (line 94) | VECTOR_CC vdouble2 ddscale_vd2_vd2_vd(vdouble2 d, vdouble s) {
function VECTOR_CC (line 98) | VECTOR_CC vdouble2 ddadd_vd2_vd_vd(vdouble x, vdouble y) {
function VECTOR_CC (line 103) | VECTOR_CC vdouble2 ddadd2_vd2_vd_vd(vdouble x, vdouble y) {
function VECTOR_CC (line 109) | VECTOR_CC vdouble2 ddadd_vd2_vd2_vd(vdouble2 x, vdouble y) {
function VECTOR_CC (line 114) | VECTOR_CC vdouble2 ddsub_vd2_vd2_vd(vdouble2 x, vdouble y) {
function VECTOR_CC (line 119) | VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd(vdouble2 x, vdouble y) {
function VECTOR_CC (line 126) | VECTOR_CC vdouble2 ddadd_vd2_vd_vd2(vdouble x, vdouble2 y) {
function VECTOR_CC (line 131) | VECTOR_CC vdouble2 ddadd2_vd2_vd_vd2(vdouble x, vdouble2 y) {
function VECTOR_CC (line 138) | VECTOR_CC vdouble2 ddadd_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
function VECTOR_CC (line 145) | VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
function VECTOR_CC (line 152) | VECTOR_CC vdouble2 ddsub_vd2_vd_vd(vdouble x, vdouble y) {
function VECTOR_CC (line 159) | VECTOR_CC vdouble2 ddsub_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
function VECTOR_CC (line 170) | VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {
function VECTOR_CC (line 178) | VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) {
function VECTOR_CC (line 183) | VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) {
function VECTOR_CC (line 188) | VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
function VECTOR_CC (line 193) | VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {
function VECTOR_CC (line 197) | VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) {
function VECTOR_CC (line 201) | VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {
function VECTOR_CC (line 206) | VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) {
function VECTOR_CC (line 211) | VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) {
function VECTOR_CC (line 216) | VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {
function VECTOR_CC (line 230) | VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) {
function VECTOR_CC (line 238) | VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {
function VECTOR_CC (line 246) | VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
function VECTOR_CC (line 254) | VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {
function VECTOR_CC (line 261) | VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) {
function VECTOR_CC (line 268) | VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) {
function VECTOR_CC (line 274) | VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) {
function VECTOR_CC (line 282) | VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) {
function VECTOR_CC (line 291) | VECTOR_CC vdouble2 ddsqrt_vd2_vd2(vdouble2 d) {
function VECTOR_CC (line 296) | VECTOR_CC vdouble2 ddsqrt_vd2_vd(vdouble d) {
FILE: src/df.h
type vfloat2 (line 7) | typedef struct {
function vfloat (line 11) | static vfloat vf2getx_vf_vf2(vfloat2 v) { return v.x; }
function vfloat (line 12) | static vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; }
function vfloat2 (line 13) | static vfloat2 vf2setxy_vf2_vf_vf(vfloat x, vfloat y) { vfloat2 v; v.x ...
function vfloat2 (line 14) | static vfloat2 vf2setx_vf2_vf2_vf(vfloat2 v, vfloat d) { v.x = d; return...
function vfloat2 (line 15) | static vfloat2 vf2sety_vf2_vf2_vf(vfloat2 v, vfloat d) { v.y = d; return...
function VECTOR_CC (line 18) | VECTOR_CC vfloat vupper_vf_vf(vfloat d) {
function VECTOR_CC (line 22) | VECTOR_CC vfloat2 vcast_vf2_vf_vf(vfloat h, vfloat l) {
function VECTOR_CC (line 26) | VECTOR_CC vfloat2 vcast_vf2_f_f(float h, float l) {
function VECTOR_CC (line 30) | VECTOR_CC vfloat2 vcast_vf2_d(double d) {
function VECTOR_CC (line 34) | VECTOR_CC vfloat2 vsel_vf2_vo_vf2_vf2(vopmask m, vfloat2 x, vfloat2 y) {
function VECTOR_CC (line 38) | VECTOR_CC vfloat2 vsel_vf2_vo_f_f_f_f(vopmask o, float x1, float y1, flo...
function VECTOR_CC (line 42) | VECTOR_CC vfloat2 vsel_vf2_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0...
function VECTOR_CC (line 46) | VECTOR_CC vfloat2 vsel_vf2_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopm...
function VECTOR_CC (line 50) | VECTOR_CC vfloat2 vabs_vf2_vf2(vfloat2 x) {
function VECTOR_CC (line 55) | VECTOR_CC vfloat vadd_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {
function VECTOR_CC (line 59) | VECTOR_CC vfloat vadd_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {
function VECTOR_CC (line 63) | VECTOR_CC vfloat vadd_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3,...
function VECTOR_CC (line 67) | VECTOR_CC vfloat vadd_vf_6vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3,...
function VECTOR_CC (line 71) | VECTOR_CC vfloat vadd_vf_7vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3,...
function VECTOR_CC (line 75) | VECTOR_CC vfloat vsub_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {
function VECTOR_CC (line 79) | VECTOR_CC vfloat vsub_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {
function VECTOR_CC (line 83) | VECTOR_CC vfloat vsub_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3,...
function VECTOR_CC (line 89) | VECTOR_CC vfloat2 dfneg_vf2_vf2(vfloat2 x) {
function VECTOR_CC (line 93) | VECTOR_CC vfloat2 dfabs_vf2_vf2(vfloat2 x) {
function VECTOR_CC (line 98) | VECTOR_CC vfloat2 dfnormalize_vf2_vf2(vfloat2 t) {
function VECTOR_CC (line 103) | VECTOR_CC vfloat2 dfscale_vf2_vf2_vf(vfloat2 d, vfloat s) {
function VECTOR_CC (line 107) | VECTOR_CC vfloat2 dfadd_vf2_vf_vf(vfloat x, vfloat y) {
function VECTOR_CC (line 112) | VECTOR_CC vfloat2 dfadd2_vf2_vf_vf(vfloat x, vfloat y) {
function VECTOR_CC (line 118) | VECTOR_CC vfloat2 dfadd2_vf2_vf_vf2(vfloat x, vfloat2 y) {
function VECTOR_CC (line 125) | VECTOR_CC vfloat2 dfadd_vf2_vf2_vf(vfloat2 x, vfloat y) {
function VECTOR_CC (line 130) | VECTOR_CC vfloat2 dfsub_vf2_vf2_vf(vfloat2 x, vfloat y) {
function VECTOR_CC (line 135) | VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf(vfloat2 x, vfloat y) {
function VECTOR_CC (line 142) | VECTOR_CC vfloat2 dfadd_vf2_vf_vf2(vfloat x, vfloat2 y) {
function VECTOR_CC (line 147) | VECTOR_CC vfloat2 dfadd_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
function VECTOR_CC (line 154) | VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
function VECTOR_CC (line 161) | VECTOR_CC vfloat2 dfsub_vf2_vf_vf(vfloat x, vfloat y) {
function VECTOR_CC (line 168) | VECTOR_CC vfloat2 dfsub_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
function VECTOR_CC (line 179) | VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {
function VECTOR_CC (line 187) | VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {
function VECTOR_CC (line 192) | VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) {
function VECTOR_CC (line 197) | VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) {
function VECTOR_CC (line 201) | VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
function VECTOR_CC (line 206) | VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {
function VECTOR_CC (line 210) | VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {
function VECTOR_CC (line 215) | VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) {
function VECTOR_CC (line 220) | VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) {
function VECTOR_CC (line 225) | VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {
function VECTOR_CC (line 250) | VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {
function VECTOR_CC (line 264) | VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {
function VECTOR_CC (line 279) | VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
function VECTOR_CC (line 295) | VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {
function VECTOR_CC (line 302) | VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) {
function VECTOR_CC (line 315) | VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) {
function VECTOR_CC (line 321) | VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) {
function VECTOR_CC (line 335) | VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) {
function VECTOR_CC (line 351) | VECTOR_CC vfloat2 dfsqrt_vf2_vf2(vfloat2 d) {
function VECTOR_CC (line 362) | VECTOR_CC vfloat2 dfsqrt_vf2_vf(vfloat d) {
FILE: src/fp16.cpp
function NSIMD_DLLEXPORT (line 90) | NSIMD_DLLEXPORT float nsimd_u16_to_f32(u16 a) {
function NSIMD_DLLEXPORT (line 149) | NSIMD_DLLEXPORT f32 nsimd_f16_to_f32(f16 a) { return nsimd_u16_to_f32(a....
function NSIMD_DLLEXPORT (line 155) | NSIMD_DLLEXPORT u16 nsimd_f32_to_u16(f32 a) {
function NSIMD_DLLEXPORT (line 252) | NSIMD_DLLEXPORT f16 nsimd_f32_to_f16(f32 a) {
type nsimd (line 268) | namespace nsimd {
function NSIMD_DLLEXPORT (line 270) | NSIMD_DLLEXPORT u16 f32_to_u16(f32 a) { return nsimd_f32_to_u16(a); }
function NSIMD_DLLEXPORT (line 271) | NSIMD_DLLEXPORT f32 u16_to_f32(u16 a) { return nsimd_u16_to_f32(a); }
function NSIMD_DLLEXPORT (line 273) | NSIMD_DLLEXPORT f16 f32_to_f16(f32 a) { return nsimd_f32_to_f16(a); }
function NSIMD_DLLEXPORT (line 274) | NSIMD_DLLEXPORT f32 f16_to_f32(f16 a) { return nsimd_f16_to_f32(a); }
FILE: src/gpu.cpp
type nsimd (line 34) | namespace nsimd {
type oneapi (line 35) | namespace oneapi {
type sycl_async_error_handler (line 37) | struct sycl_async_error_handler {
function NSIMD_DLLSPEC (line 56) | NSIMD_DLLSPEC void *nsimd_oneapi_default_queue() {
function NSIMD_DLLSPEC (line 62) | NSIMD_DLLSPEC nsimd_nat nsimd_kernel_param(nsimd_nat nb_items,
function NSIMD_DLLSPEC (line 74) | NSIMD_DLLSPEC nsimd_nat nsimd_kernel_param(nsimd_nat nb_items,
function NSIMD_DLLSPEC (line 84) | NSIMD_DLLSPEC nsimd_nat nsimd_kernel_param(nsimd_nat nb_items,
FILE: src/helperadvsimd.h
type uint32x4_t (line 48) | typedef uint32x4_t vmask;
type uint32x4_t (line 49) | typedef uint32x4_t vopmask;
type float32x4_t (line 52) | typedef float32x4_t vfloat;
type int32x4_t (line 53) | typedef int32x4_t vint2;
type float64x2_t (line 56) | typedef float64x2_t vdouble;
type int32x2_t (line 57) | typedef int32x2_t vint;
type vmask2 (line 59) | typedef struct {
function INLINE (line 65) | static INLINE int vavailability_i(int name) { return 3; }
function INLINE (line 66) | static INLINE void vprefetch_v_p(const void *ptr) { }
function vtestallones_i_vo32 (line 68) | int vtestallones_i_vo32(vopmask g) {
function vtestallones_i_vo64 (line 74) | int vtestallones_i_vo64(vopmask g) {
function vdouble (line 81) | vdouble vload_vd_p(const double *ptr) { return vld1q_f64(ptr); }
function vdouble (line 82) | vdouble vloadu_vd_p(const double *ptr) { return vld1q_f64(ptr); }
function vstore_v_p_vd (line 83) | void vstore_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }
function vstoreu_v_p_vd (line 84) | void vstoreu_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }
function vfloat (line 85) | vfloat vload_vf_p(const float *ptr) { return vld1q_f32(ptr); }
function vfloat (line 86) | vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
function vstore_v_p_vf (line 87) | void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
function vstoreu_v_p_vf (line 88) | void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
function vint2 (line 89) | vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
function vstoreu_v_p_vi2 (line 90) | void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
function vint (line 91) | vint vloadu_vi_p(int32_t *p) { return vld1_s32(p); }
function vstoreu_v_p_vi (line 92) | void vstoreu_v_p_vi(int32_t *p, vint v) { vst1_s32(p, v); }
function vdouble (line 94) | vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
function vfloat (line 98) | vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
function vmask (line 108) | vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
function vmask (line 109) | vmask vandnot_vm_vm_vm(vmask x, vmask y) {
function vmask (line 112) | vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); }
function vmask (line 113) | vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); }
function vmask (line 116) | vmask vreinterpret_vm_vf(vfloat vf) {
function vfloat (line 119) | vfloat vreinterpret_vf_vm(vmask vm) {
function vint2 (line 122) | vint2 vcast_vi2_vm(vmask vm) { return vreinterpretq_s32_u32(vm); }
function vmask (line 123) | vmask vcast_vm_vi2(vint2 vi) { return vreinterpretq_u32_s32(vi); }
function vmask (line 126) | vmask vreinterpret_vm_vd(vdouble vd) {
function vdouble (line 129) | vdouble vreinterpret_vd_vm(vmask vm) {
function vfloat (line 132) | vfloat vreinterpret_vf_vi2(vint2 vm) {
function vint2 (line 135) | vint2 vreinterpret_vi2_vf(vfloat vf) {
function vint2 (line 138) | vint2 vreinterpret_vi2_vd(vdouble vd) {
function vfloat (line 146) | vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }
function vfloat (line 149) | vfloat vadd_vf_vf_vf(vfloat x, vfloat y) {
function vfloat (line 152) | vfloat vsub_vf_vf_vf(vfloat x, vfloat y) {
function vfloat (line 155) | vfloat vmul_vf_vf_vf(vfloat x, vfloat y) {
function vfloat (line 160) | vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }
function vfloat (line 161) | vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); }
function vfloat (line 165) | vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
function vfloat (line 169) | vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
function vfloat (line 173) | vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
function vfloat (line 177) | vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_v...
function vfloat (line 178) | vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf...
function vfloat (line 179) | vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf...
function vfloat (line 182) | vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z + x * y
function vfloat (line 186) | vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z - x * y
function vfloat (line 190) | vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // x * y - z
function vfloat (line 195) | vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
function vfloat (line 216) | vfloat vrec_vf_vf(vfloat d) {
function vfloat (line 225) | vfloat vsqrt_vf_vf(vfloat d) {
function vfloat (line 251) | vfloat vmax_vf_vf_vf(vfloat x, vfloat y) {
function vfloat (line 254) | vfloat vmin_vf_vf_vf(vfloat x, vfloat y) {
function vmask (line 259) | vmask veq_vm_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); }
function vmask (line 260) | vmask vneq_vm_vf_vf(vfloat x, vfloat y) {
function vmask (line 263) | vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); }
function vmask (line 264) | vmask vle_vm_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); }
function vmask (line 265) | vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); }
function vmask (line 266) | vmask vge_vm_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); }
function vfloat (line 269) | vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) {
function vint2 (line 274) | vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); }
function vfloat (line 275) | vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); }
function vint2 (line 276) | vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); }
function vint2 (line 277) | vint2 vrint_vi2_vf(vfloat d) {
function vint2 (line 286) | vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) {
function vint2 (line 289) | vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) {
function vint2 (line 292) | vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); }
function vint2 (line 295) | vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) {
function vint2 (line 298) | vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) {
function vint2 (line 301) | vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) {
function vint2 (line 304) | vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) {
function vmask (line 326) | vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); }
function vmask (line 327) | vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { return vcgeq_s32(x, y); }
function vint2 (line 329) | vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
function vint2 (line 332) | vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
function vint2 (line 337) | vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) {
function vdouble (line 350) | vdouble vcast_vd_d(double f) { return vdupq_n_f64(f); }
function vdouble (line 353) | vdouble vadd_vd_vd_vd(vdouble x, vdouble y) {
function vdouble (line 356) | vdouble vsub_vd_vd_vd(vdouble x, vdouble y) {
function vdouble (line 359) | vdouble vmul_vd_vd_vd(vdouble x, vdouble y) {
function vdouble (line 364) | vdouble vabs_vd_vd(vdouble f) { return vabsq_f64(f); }
function vdouble (line 365) | vdouble vneg_vd_vd(vdouble f) { return vnegq_f64(f); }
function vdouble (line 368) | vdouble vmax_vd_vd_vd(vdouble x, vdouble y) {
function vdouble (line 371) | vdouble vmin_vd_vd_vd(vdouble x, vdouble y) {
function vdouble (line 377) | vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
function vdouble (line 381) | vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
function vdouble (line 386) | vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
function vdouble (line 390) | vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_...
function vdouble (line 391) | vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsu...
function vdouble (line 394) | vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z + x * y
function vdouble (line 398) | vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z - x * y
function vdouble (line 402) | vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // x * y - z
function vdouble (line 407) | vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {
function vdouble (line 429) | vdouble vrec_vd_vd(vdouble d) {
function vdouble (line 438) | vdouble vsqrt_vd_vd(vdouble d) {
function vopmask (line 466) | vopmask veq_vo_vd_vd(vdouble x, vdouble y) {
function vopmask (line 469) | vopmask vneq_vo_vd_vd(vdouble x, vdouble y) {
function vopmask (line 472) | vopmask vlt_vo_vd_vd(vdouble x, vdouble y) {
function vopmask (line 475) | vopmask vgt_vo_vd_vd(vdouble x, vdouble y) {
function vopmask (line 478) | vopmask vle_vo_vd_vd(vdouble x, vdouble y) {
function vopmask (line 481) | vopmask vge_vo_vd_vd(vdouble x, vdouble y) {
function vdouble (line 486) | vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) {
function VECTOR_CC (line 491) | VECTOR_CC vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
function vdouble (line 495) | vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1...
function vdouble (line 499) | vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, dou...
function VECTOR_CC (line 505) | VECTOR_CC vdouble vsel_vd_vo_d_d(vopmask o, double d0, double d1) {
function vdouble (line 513) | vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, dou...
function vdouble (line 523) | vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1...
function vdouble (line 528) | vdouble vrint_vd_vd(vdouble d) { return vrndnq_f64(d); }
function vfloat (line 529) | vfloat vrint_vf_vf(vfloat d) { return vrndnq_f32(d); }
function vint (line 534) | vint vtruncate_vi_vd(vdouble vf) {
function vdouble (line 537) | vdouble vcast_vd_vi(vint vi) {
function vint (line 540) | vint vcast_vi_i(int i) { return vdup_n_s32(i); }
function vint (line 541) | vint vrint_vi_vd(vdouble d) {
function vint (line 550) | vint vadd_vi_vi_vi(vint x, vint y) { return vadd_s32(x, y); }
function vint (line 551) | vint vsub_vi_vi_vi(vint x, vint y) { return vsub_s32(x, y); }
function vint (line 552) | vint vneg_vi_vi(vint e) { return vneg_s32(e); }
function vint (line 555) | vint vand_vi_vi_vi(vint x, vint y) { return vand_s32(x, y); }
function vint (line 556) | vint vandnot_vi_vi_vi(vint x, vint y) { return vbic_s32(y, x); }
function vint (line 557) | vint vor_vi_vi_vi(vint x, vint y) { return vorr_s32(x, y); }
function vint (line 558) | vint vxor_vi_vi_vi(vint x, vint y) { return veor_s32(x, y); }
function vopmask (line 561) | vopmask veq_vo_vi_vi(vint x, vint y) {
function vint (line 566) | vint vsel_vi_vm_vi_vi(vmask m, vint x, vint y) {
function vopmask (line 573) | vopmask visinf_vo_vd(vdouble d) {
function vopmask (line 580) | vopmask visnan_vo_vd(vdouble d) {
function vopmask (line 584) | vopmask vispinf_vo_vd(vdouble d) {
function vopmask (line 588) | vopmask visminf_vo_vd(vdouble d) {
function vfloat (line 592) | vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {
function VECTOR_CC (line 596) | VECTOR_CC vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
function vfloat (line 600) | vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, f...
function vfloat (line 604) | vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, floa...
function vopmask (line 608) | vopmask veq_vo_vf_vf(vfloat x, vfloat y) {
function vopmask (line 611) | vopmask vneq_vo_vf_vf(vfloat x, vfloat y) {
function vopmask (line 614) | vopmask vlt_vo_vf_vf(vfloat x, vfloat y) {
function vopmask (line 617) | vopmask vle_vo_vf_vf(vfloat x, vfloat y) {
function vopmask (line 620) | vopmask vgt_vo_vf_vf(vfloat x, vfloat y) {
function vopmask (line 623) | vopmask vge_vo_vf_vf(vfloat x, vfloat y) {
function vopmask (line 627) | vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) {
function vopmask (line 630) | vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) {
function vopmask (line 633) | vopmask vgt_vo_vi_vi(vint x, vint y) {
function vopmask (line 636) | vopmask visinf_vo_vf(vfloat d) {
function vopmask (line 639) | vopmask vispinf_vo_vf(vfloat d) {
function vopmask (line 642) | vopmask visminf_vo_vf(vfloat d) {
function vopmask (line 645) | vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
function vopmask (line 647) | vopmask vcast_vo32_vo64(vopmask m) {
function vopmask (line 650) | vopmask vcast_vo64_vo32(vopmask m) {
function vopmask (line 654) | vopmask vand_vo_vo_vo(vopmask x, vopmask y) {
function vopmask (line 657) | vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) {
function vopmask (line 660) | vopmask vor_vo_vo_vo(vopmask x, vopmask y) {
function vopmask (line 663) | vopmask vxor_vo_vo_vo(vopmask x, vopmask y) {
function vint2 (line 667) | vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
function vint2 (line 670) | vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) {
function vint2 (line 673) | vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) {
function vint (line 676) | vint vandnot_vi_vo_vi(vopmask x, vint y) {
function vmask (line 679) | vmask vand_vm_vo32_vm(vopmask x, vmask y) {
function vmask (line 682) | vmask vand_vm_vo64_vm(vopmask x, vmask y) {
function vmask (line 685) | vmask vandnot_vm_vo32_vm(vopmask x, vmask y) {
function vmask (line 688) | vmask vandnot_vm_vo64_vm(vopmask x, vmask y) {
function vmask (line 691) | vmask vor_vm_vo32_vm(vopmask x, vmask y) {
function vmask (line 694) | vmask vor_vm_vo64_vm(vopmask x, vmask y) {
function vmask (line 697) | vmask vxor_vm_vo32_vm(vopmask x, vmask y) {
function vfloat (line 701) | vfloat vtruncate_vf_vf(vfloat vd) { return vrndq_f32(vd); }
function vmask (line 703) | vmask vcast_vm_i_i(int i0, int i1) {
function vopmask (line 707) | vopmask veq64_vo_vm_vm(vmask x, vmask y) {
function vmask (line 711) | vmask vadd64_vm_vm_vm(vmask x, vmask y) {
function vint (line 715) | vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
function vint (line 720) | vint vand_vi_vo_vi(vopmask x, vint y) {
function vint2 (line 724) | vint2 vcastu_vi2_vi(vint vi) {
function vint (line 727) | vint vcastu_vi_vi2(vint2 vi2) {
function vdouble (line 730) | vdouble vreinterpret_vd_vi2(vint2 vi) {
function vdouble (line 733) | vdouble vtruncate_vd_vd(vdouble vd) { return vrndq_f64(vd); }
function vdouble (line 742) | vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_...
function vdouble (line 743) | vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_...
function vfloat (line 744) | vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, ...
function vfloat (line 745) | vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, ...
function vdouble (line 747) | vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x,...
function vfloat (line 748) | vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0,...
function vdouble (line 749) | vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return ...
function vfloat (line 750) | vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub...
function vdouble (line 752) | vdouble vrev21_vd_vd(vdouble d0) { return (float64x2_t)vcombine_u64(vget...
function vdouble (line 753) | vdouble vreva2_vd_vd(vdouble vd) { return vd; }
function vstream_v_p_vd (line 755) | void vstream_v_p_vd(double *ptr, vdouble v) { vstore_v_p_vd(ptr, v); }
function vscatter2_v_p_i_i_vd (line 756) | void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) ...
function vsscatter2_v_p_i_i_vd (line 757) | void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v)...
function vfloat (line 759) | vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); }
function vfloat (line 760) | vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), ...
function vint2 (line 761) | vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(...
function vstream_v_p_vf (line 763) | void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }
function vscatter2_v_p_i_i_vf (line 765) | void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
function vsscatter2_v_p_i_i_vf (line 770) | void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
function INLINE (line 777) | static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) {
function INLINE (line 783) | static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
function INLINE (line 789) | static INLINE vint vuninterleave_vi_vi(vint v) { return v; }
function INLINE (line 790) | static INLINE vdouble vinterleave_vd_vd(vdouble vd) { return vd; }
function INLINE (line 791) | static INLINE vdouble vuninterleave_vd_vd(vdouble vd) { return vd; }
function INLINE (line 792) | static INLINE vmask vinterleave_vm_vm(vmask vm) { return vm; }
function INLINE (line 793) | static INLINE vmask vuninterleave_vm_vm(vmask vm) { return vm; }
function vmask2 (line 795) | static vmask2 vloadu_vm2_p(void *p) {
type Sleef_quad2 (line 802) | typedef Sleef_quad2 vargquad;
function INLINE (line 804) | static INLINE vmask2 vcast_vm2_aq(vargquad aq) {
function INLINE (line 808) | static INLINE vargquad vcast_aq_vm2(vmask2 vm2) {
function INLINE (line 816) | static INLINE int vtestallzeros_i_vo64(vopmask g) {
function INLINE (line 822) | static INLINE vmask vsel_vm_vo64_vm_vm(vopmask m, vmask x, vmask y) { re...
function INLINE (line 824) | static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
function INLINE (line 828) | static INLINE vmask vneg64_vm_vm(vmask x) {
function INLINE (line 832) | static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
function INLINE (line 841) | static INLINE vmask vcast_vm_vi(vint vi) {
function INLINE (line 845) | static INLINE vint vcast_vi_vm(vmask vm) { return vreinterpret_s32_u32(v...
FILE: src/helperavx.h
type __m256i (line 52) | typedef __m256i vmask;
type __m256i (line 53) | typedef __m256i vopmask;
type __m256d (line 55) | typedef __m256d vdouble;
type __m128i (line 56) | typedef __m128i vint;
type __m256 (line 58) | typedef __m256 vfloat;
type vint2 (line 59) | typedef struct { __m128i x, y; } vint2;
type vmask2 (line 61) | typedef struct {
function Sleef_x86CpuID (line 70) | static inline
function INLINE (line 81) | static INLINE int cpuSupportsAVX() {
function INLINE (line 87) | static INLINE int cpuSupportsFMA4() {
function INLINE (line 94) | static INLINE int vavailability_i(int name) {
function INLINE (line 108) | static INLINE int vavailability_i(int name) {
function INLINE (line 120) | static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _M...
function INLINE (line 122) | static INLINE int vtestallones_i_vo32(vopmask g) {
function INLINE (line 126) | static INLINE int vtestallones_i_vo64(vopmask g) {
function INLINE (line 132) | static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); }
function INLINE (line 133) | static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castp...
function INLINE (line 134) | static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_casts...
function INLINE (line 135) | static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) {
function INLINE (line 141) | static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) {
function vint2 (line 149) | static vint2 vloadu_vi2_p(int32_t *p) {
function vstoreu_v_p_vi2 (line 156) | static void vstoreu_v_p_vi2(int32_t *p, vint2 v) {
function vint (line 161) | static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)...
function vstoreu_v_p_vi (line 162) | static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m12...
function INLINE (line 166) | static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpre...
function INLINE (line 167) | static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinter...
function INLINE (line 168) | static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret...
function INLINE (line 169) | static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpre...
function INLINE (line 171) | static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vrein...
function INLINE (line 172) | static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vr...
function INLINE (line 173) | static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreint...
function INLINE (line 174) | static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vrein...
function INLINE (line 176) | static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinte...
function INLINE (line 177) | static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vrei...
function INLINE (line 178) | static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinter...
function INLINE (line 179) | static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinte...
function INLINE (line 181) | static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinte...
function INLINE (line 182) | static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vrei...
function INLINE (line 183) | static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinter...
function INLINE (line 184) | static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinte...
function INLINE (line 186) | static INLINE vopmask vcast_vo32_vo64(vopmask o) {
function INLINE (line 190) | static INLINE vopmask vcast_vo64_vo32(vopmask o) {
function INLINE (line 196) | static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(v...
function INLINE (line 197) | static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_ep...
function INLINE (line 198) | static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(v...
function INLINE (line 199) | static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_...
function INLINE (line 200) | static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd,...
function INLINE (line 201) | static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps...
function INLINE (line 202) | static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(v...
function INLINE (line 203) | static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }
function INLINE (line 204) | static INLINE vint2 vcastu_vi2_vi(vint vi) {
function INLINE (line 211) | static INLINE vint vcastu_vi_vi2(vint2 vi) {
function INLINE (line 216) | static INLINE vmask vcast_vm_i_i(int i0, int i1) {
function INLINE (line 220) | static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
function INLINE (line 226) | static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm25...
function INLINE (line 227) | static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm25...
function INLINE (line 228) | static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm25...
function INLINE (line 229) | static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm25...
function INLINE (line 230) | static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm25...
function INLINE (line 231) | static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); }
function INLINE (line 232) | static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_m...
function INLINE (line 233) | static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm25...
function INLINE (line 234) | static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm25...
function INLINE (line 235) | static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm25...
function INLINE (line 238) | static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) ...
function INLINE (line 239) | static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 240) | static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 242) | static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) ...
function INLINE (line 243) | static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 244) | static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 245) | static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) ...
function INLINE (line 246) | static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 247) | static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 248) | static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 249) | static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 252) | static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreint...
function INLINE (line 253) | static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vrein...
function INLINE (line 254) | static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreint...
function INLINE (line 255) | static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreint...
function INLINE (line 256) | static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreint...
function INLINE (line 257) | static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreint...
function INLINE (line 261) | static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(...
function INLINE (line 262) | static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(...
function INLINE (line 263) | static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(...
function INLINE (line 265) | static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(...
function INLINE (line 266) | static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_...
function INLINE (line 267) | static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x,...
function INLINE (line 268) | static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(...
function INLINE (line 270) | static INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andn...
function INLINE (line 271) | static INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si1...
function INLINE (line 273) | static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x...
function INLINE (line 274) | static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x...
function INLINE (line 275) | static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x...
function INLINE (line 277) | static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32...
function INLINE (line 278) | static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32...
function INLINE (line 280) | static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_casts...
function INLINE (line 281) | static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_casts...
function INLINE (line 283) | static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) { return ...
function INLINE (line 285) | static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) ...
function vdouble (line 287) | vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
function INLINE (line 291) | static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double...
function INLINE (line 295) | static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, v...
function INLINE (line 299) | static INLINE vopmask visinf_vo_vd(vdouble d) {
function INLINE (line 303) | static INLINE vopmask vispinf_vo_vd(vdouble d) {
function INLINE (line 307) | static INLINE vopmask visminf_vo_vd(vdouble d) {
function INLINE (line 311) | static INLINE vopmask visnan_vo_vd(vdouble d) {
function INLINE (line 315) | static INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load...
function INLINE (line 316) | static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loa...
function INLINE (line 318) | static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_...
function INLINE (line 319) | static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_store...
function INLINE (line 321) | static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
function INLINE (line 329) | static INLINE double vcast_d_vd(vdouble v) {
function INLINE (line 338) | static INLINE vint2 vcast_vi2_vm(vmask vm) {
function INLINE (line 345) | static INLINE vmask vcast_vm_vi2(vint2 vi) {
function INLINE (line 351) | static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256...
function INLINE (line 352) | static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_m...
function INLINE (line 353) | static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(...
function INLINE (line 354) | static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }
function INLINE (line 355) | static INLINE vint2 vcast_vi2_i(int i) { vint2 r; r.x = r.y = _mm_set1_e...
function INLINE (line 356) | static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps...
function INLINE (line 357) | static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi...
function INLINE (line 359) | static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret...
function INLINE (line 360) | static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm...
function INLINE (line 362) | static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_a...
function INLINE (line 363) | static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_s...
function INLINE (line 364) | static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_m...
function INLINE (line 365) | static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_d...
function INLINE (line 366) | static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_v...
function INLINE (line 367) | static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }
function INLINE (line 368) | static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(va...
function INLINE (line 369) | static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vx...
function INLINE (line 370) | static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_m...
function INLINE (line 371) | static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_m...
function INLINE (line 374) | static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { re...
function INLINE (line 375) | static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 376) | static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 378) | static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { re...
function INLINE (line 379) | static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 380) | static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 381) | static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { re...
function INLINE (line 382) | static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 383) | static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 384) | static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 385) | static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 388) | static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinter...
function INLINE (line 389) | static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinte...
function INLINE (line 390) | static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinter...
function INLINE (line 391) | static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinter...
function INLINE (line 392) | static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinter...
function INLINE (line 393) | static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinter...
function INLINE (line 395) | static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) {
function INLINE (line 400) | static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) {
function INLINE (line 405) | static INLINE vint2 vneg_vi2_vi2(vint2 e) {
function INLINE (line 410) | static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) {
function INLINE (line 415) | static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) {
function INLINE (line 420) | static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) {
function INLINE (line 425) | static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) {
function INLINE (line 430) | static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi...
function INLINE (line 431) | static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vand...
function INLINE (line 433) | static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {
function INLINE (line 438) | static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {
function INLINE (line 443) | static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) {
function INLINE (line 448) | static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) {
function INLINE (line 455) | static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) {
function INLINE (line 462) | static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
function INLINE (line 469) | static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
function INLINE (line 476) | static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
function INLINE (line 482) | static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {
function INLINE (line 489) | static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { r...
function vfloat (line 491) | vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
function INLINE (line 495) | static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d...
function INLINE (line 499) | static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vo...
function INLINE (line 503) | static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_...
function INLINE (line 504) | static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, v...
function INLINE (line 505) | static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, v...
function INLINE (line 506) | static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d...
function INLINE (line 510) | static INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_p...
function INLINE (line 511) | static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu...
function INLINE (line 513) | static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps...
function INLINE (line 514) | static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_...
function INLINE (line 516) | static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
function INLINE (line 525) | static INLINE float vcast_f_vf(vfloat v) {
function INLINE (line 538) | static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_...
function INLINE (line 539) | static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_...
function INLINE (line 540) | static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm...
function INLINE (line 541) | static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm...
function INLINE (line 543) | static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _m...
function INLINE (line 544) | static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm25...
function INLINE (line 547) | static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdoubl...
function INLINE (line 548) | static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)...
function INLINE (line 550) | static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdoubl...
function INLINE (line 551) | static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)...
function INLINE (line 555) | static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm256_shuffle_...
function INLINE (line 556) | static INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f12...
function INLINE (line 558) | static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_strea...
function INLINE (line 559) | static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int ste...
function INLINE (line 564) | static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int st...
function INLINE (line 571) | static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(...
function INLINE (line 572) | static INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_...
function INLINE (line 573) | static INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf...
function INLINE (line 575) | static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_...
function INLINE (line 577) | static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step...
function INLINE (line 584) | static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int ste...
function INLINE (line 588) | static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) {
function INLINE (line 594) | static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
function INLINE (line 600) | static INLINE vint vuninterleave_vi_vi(vint v) {
function INLINE (line 604) | static INLINE vdouble vinterleave_vd_vd(vdouble vd) {
function INLINE (line 611) | static INLINE vdouble vuninterleave_vd_vd(vdouble vd) {
function INLINE (line 618) | static INLINE vmask vinterleave_vm_vm(vmask vm) {
function INLINE (line 625) | static INLINE vmask vuninterleave_vm_vm(vmask vm) {
function vmask2 (line 632) | static vmask2 vloadu_vm2_p(void *p) {
type Sleef_quad4 (line 639) | typedef Sleef_quad4 vargquad;
function INLINE (line 641) | static INLINE vmask2 vcast_vm2_aq(vargquad aq) {
function INLINE (line 645) | static INLINE vargquad vcast_aq_vm2(vmask2 vm2) {
function INLINE (line 653) | static INLINE int vtestallzeros_i_vo64(vopmask g) {
function INLINE (line 657) | static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
function INLINE (line 661) | static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
function INLINE (line 668) | static INLINE vmask vneg64_vm_vm(vmask x) { return vsub64_vm_vm_vm(vcast...
function INLINE (line 669) | static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
function INLINE (line 686) | static INLINE vmask vcast_vm_vi(vint vi) {
function INLINE (line 692) | static INLINE vint vcast_vi_vm(vmask vm) {
FILE: src/helperavx2.h
type __m256i (line 50) | typedef __m256i vmask;
type __m256i (line 51) | typedef __m256i vopmask;
type __m256d (line 53) | typedef __m256d vdouble;
type __m128i (line 54) | typedef __m128i vint;
type __m256 (line 56) | typedef __m256 vfloat;
type __m256i (line 57) | typedef __m256i vint2;
type vmask2 (line 59) | typedef struct {
function Sleef_x86CpuID (line 68) | static inline
function INLINE (line 79) | static INLINE int cpuSupportsAVX2() {
function INLINE (line 85) | static INLINE int cpuSupportsFMA() {
function INLINE (line 92) | static INLINE int vavailability_i(int name) {
function INLINE (line 102) | static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _M...
function INLINE (line 104) | static INLINE int vtestallones_i_vo32(vopmask g) {
function INLINE (line 108) | static INLINE int vtestallones_i_vo64(vopmask g) {
function INLINE (line 114) | static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); }
function INLINE (line 115) | static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castp...
function INLINE (line 116) | static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_casts...
function INLINE (line 117) | static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return _mm256_cast...
function INLINE (line 118) | static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return _mm256_cast...
function vint2 (line 122) | static vint2 vloadu_vi2_p(int32_t *p) { return _mm256_loadu_si256((__m25...
function vstoreu_v_p_vi2 (line 123) | static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm256_storeu_si256((...
function vint (line 124) | static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)...
function vstoreu_v_p_vi (line 125) | static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m12...
function INLINE (line 129) | static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpre...
function INLINE (line 130) | static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinter...
function INLINE (line 131) | static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret...
function INLINE (line 132) | static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpre...
function INLINE (line 134) | static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vrein...
function INLINE (line 135) | static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vr...
function INLINE (line 136) | static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreint...
function INLINE (line 137) | static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vrein...
function INLINE (line 139) | static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinte...
function INLINE (line 140) | static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vrei...
function INLINE (line 141) | static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinter...
function INLINE (line 142) | static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinte...
function INLINE (line 144) | static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinte...
function INLINE (line 145) | static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vrei...
function INLINE (line 146) | static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinter...
function INLINE (line 147) | static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinte...
function INLINE (line 149) | static INLINE vopmask vcast_vo32_vo64(vopmask o) {
function INLINE (line 153) | static INLINE vopmask vcast_vo64_vo32(vopmask o) {
function INLINE (line 159) | static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(v...
function INLINE (line 160) | static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_ep...
function INLINE (line 161) | static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(v...
function INLINE (line 162) | static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd,...
function INLINE (line 163) | static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_...
function INLINE (line 164) | static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps...
function INLINE (line 165) | static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(v...
function INLINE (line 166) | static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }
function INLINE (line 168) | static INLINE vint2 vcastu_vi2_vi(vint vi) {
function INLINE (line 172) | static INLINE vint vcastu_vi_vi2(vint2 vi) {
function INLINE (line 177) | static INLINE vmask vcast_vm_i_i(int i0, int i1) {
function INLINE (line 181) | static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm256_c...
function INLINE (line 182) | static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm256_ad...
function INLINE (line 186) | static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm25...
function INLINE (line 187) | static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm25...
function INLINE (line 188) | static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm25...
function INLINE (line 189) | static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm25...
function INLINE (line 190) | static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm25...
function INLINE (line 191) | static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); }
function INLINE (line 192) | static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_m...
function INLINE (line 193) | static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm25...
function INLINE (line 194) | static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) ...
function INLINE (line 195) | static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 196) | static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 197) | static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm25...
function INLINE (line 198) | static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm25...
function INLINE (line 200) | static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) ...
function INLINE (line 201) | static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 202) | static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 203) | static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 204) | static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 206) | static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreint...
function INLINE (line 207) | static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vrein...
function INLINE (line 208) | static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreint...
function INLINE (line 209) | static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreint...
function INLINE (line 210) | static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreint...
function INLINE (line 211) | static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreint...
function INLINE (line 215) | static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(...
function INLINE (line 216) | static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(...
function INLINE (line 217) | static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(...
function INLINE (line 219) | static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(...
function INLINE (line 220) | static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_...
function INLINE (line 221) | static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x,...
function INLINE (line 222) | static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(...
function INLINE (line 224) | static INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andn...
function INLINE (line 225) | static INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si1...
function INLINE (line 227) | static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x...
function INLINE (line 228) | static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x...
function INLINE (line 229) | static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x...
function INLINE (line 231) | static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32...
function INLINE (line 232) | static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32...
function INLINE (line 234) | static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_casts...
function INLINE (line 235) | static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_casts...
function INLINE (line 237) | static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return ...
function INLINE (line 239) | static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) ...
function INLINE (line 240) | static INLINE vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { ...
function INLINE (line 242) | static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, v...
function INLINE (line 250) | static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double...
function INLINE (line 254) | static INLINE vopmask visinf_vo_vd(vdouble d) {
function INLINE (line 258) | static INLINE vopmask vispinf_vo_vd(vdouble d) {
function INLINE (line 262) | static INLINE vopmask visminf_vo_vd(vdouble d) {
function INLINE (line 266) | static INLINE vopmask visnan_vo_vd(vdouble d) {
function INLINE (line 272) | static INLINE double vcast_d_vd(vdouble v) {
function INLINE (line 279) | static INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load...
function INLINE (line 280) | static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loa...
function INLINE (line 282) | static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_...
function INLINE (line 283) | static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_store...
function INLINE (line 285) | static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { retu...
function INLINE (line 289) | static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
function INLINE (line 290) | static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
function INLINE (line 292) | static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256...
function INLINE (line 293) | static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_m...
function INLINE (line 294) | static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(...
function INLINE (line 295) | static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }
function INLINE (line 296) | static INLINE vint2 vcast_vi2_i(int i) { return _mm256_set1_epi32(i); }
function INLINE (line 297) | static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps...
function INLINE (line 298) | static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi...
function INLINE (line 300) | static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret...
function INLINE (line 301) | static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm...
function INLINE (line 303) | static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_a...
function INLINE (line 304) | static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_s...
function INLINE (line 305) | static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_m...
function INLINE (line 306) | static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_d...
function INLINE (line 307) | static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_v...
function INLINE (line 308) | static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }
function INLINE (line 309) | static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(va...
function INLINE (line 310) | static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vx...
function INLINE (line 311) | static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { re...
function INLINE (line 312) | static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 313) | static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 314) | static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_m...
function INLINE (line 315) | static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_m...
function INLINE (line 317) | static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { re...
function INLINE (line 318) | static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 319) | static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 320) | static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 321) | static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 323) | static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinter...
function INLINE (line 324) | static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinte...
function INLINE (line 325) | static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinter...
function INLINE (line 326) | static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinter...
function INLINE (line 327) | static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinter...
function INLINE (line 328) | static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinter...
function INLINE (line 330) | static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_a...
function INLINE (line 331) | static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_s...
function INLINE (line 332) | static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcas...
function INLINE (line 334) | static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_a...
function INLINE (line 335) | static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm25...
function INLINE (line 336) | static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_or...
function INLINE (line 337) | static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_x...
function INLINE (line 339) | static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi...
function INLINE (line 340) | static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vand...
function INLINE (line 342) | static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return _mm256_slli_...
function INLINE (line 343) | static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return _mm256_srli_...
function INLINE (line 344) | static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return _mm256_srai_...
function INLINE (line 346) | static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_c...
function INLINE (line 347) | static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_c...
function INLINE (line 348) | static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cm...
function INLINE (line 349) | static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cm...
function INLINE (line 351) | static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
function INLINE (line 355) | static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { r...
function vfloat (line 359) | vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
function INLINE (line 363) | static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d...
function INLINE (line 367) | static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vo...
function INLINE (line 371) | static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_...
function INLINE (line 372) | static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, v...
function INLINE (line 373) | static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, v...
function INLINE (line 374) | static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d...
function INLINE (line 378) | static INLINE float vcast_f_vf(vfloat v) {
function INLINE (line 385) | static INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_p...
function INLINE (line 386) | static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu...
function INLINE (line 388) | static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps...
function INLINE (line 389) | static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_...
function INLINE (line 391) | static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { ret...
function INLINE (line 400) | static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_...
function INLINE (line 401) | static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_...
function INLINE (line 402) | static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm...
function INLINE (line 403) | static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm...
function INLINE (line 405) | static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _m...
function INLINE (line 406) | static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm25...
function INLINE (line 408) | static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdoubl...
function INLINE (line 409) | static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)...
function INLINE (line 411) | static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm256_shuffle_...
function INLINE (line 412) | static INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f12...
function INLINE (line 414) | static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_strea...
function INLINE (line 415) | static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int ste...
function INLINE (line 420) | static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int st...
function INLINE (line 427) | static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(...
function INLINE (line 428) | static INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_...
function INLINE (line 429) | static INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf...
function INLINE (line 431) | static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_...
function INLINE (line 433) | static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step...
function INLINE (line 440) | static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int ste...
function INLINE (line 444) | static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) {
function INLINE (line 448) | static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
function INLINE (line 452) | static INLINE vint vuninterleave_vi_vi(vint v) {
function INLINE (line 456) | static INLINE vdouble vinterleave_vd_vd(vdouble vd) {
function INLINE (line 460) | static INLINE vdouble vuninterleave_vd_vd(vdouble vd) {
function INLINE (line 464) | static INLINE vmask vinterleave_vm_vm(vmask vm) {
function INLINE (line 468) | static INLINE vmask vuninterleave_vm_vm(vmask vm) {
function vmask2 (line 472) | static vmask2 vloadu_vm2_p(void *p) {
type Sleef_quad4 (line 479) | typedef Sleef_quad4 vargquad;
function INLINE (line 481) | static INLINE vmask2 vcast_vm2_aq(vargquad aq) {
function INLINE (line 485) | static INLINE vargquad vcast_aq_vm2(vmask2 vm2) {
function INLINE (line 493) | static INLINE int vtestallzeros_i_vo64(vopmask g) {
function INLINE (line 497) | static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { re...
function INLINE (line 499) | static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm256_su...
function INLINE (line 500) | static INLINE vmask vneg64_vm_vm(vmask x) { return _mm256_sub_epi64(vcas...
function INLINE (line 501) | static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm256_c...
function INLINE (line 508) | static INLINE vmask vcast_vm_vi(vint vi) { return _mm256_cvtepi32_epi64(...
function INLINE (line 509) | static INLINE vint vcast_vi_vm(vmask vm) {
FILE: src/helperavx512f.h
type __m512i (line 53) | typedef __m512i vmask;
type __mmask16 (line 54) | typedef __mmask16 vopmask;
type __m512d (line 56) | typedef __m512d vdouble;
type __m256i (line 57) | typedef __m256i vint;
type __m512 (line 59) | typedef __m512 vfloat;
type __m512i (line 60) | typedef __m512i vint2;
type vmask2 (line 62) | typedef struct {
function Sleef_x86CpuID (line 71) | static inline
function INLINE (line 82) | static INLINE int cpuSupportsAVX512F() {
function INLINE (line 89) | static INLINE int vavailability_i(int name) {
function INLINE (line 98) | static INLINE int vavailability_i(int name) {
function INLINE (line 108) | static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _M...
function INLINE (line 111) | static INLINE int vtestallones_i_vo64(vopmask g) { return _mm512_mask2in...
function INLINE (line 112) | static INLINE int vtestallones_i_vo32(vopmask g) { return _mm512_mask2in...
function INLINE (line 114) | static INLINE int vtestallones_i_vo64(vopmask g) { return g == 0xff; }
function INLINE (line 115) | static INLINE int vtestallones_i_vo32(vopmask g) { return g == 0xffff; }
function vint2 (line 120) | static vint2 vloadu_vi2_p(int32_t *p) { return _mm512_loadu_si512((__m51...
function vstoreu_v_p_vi2 (line 121) | static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm512_storeu_si512((...
function vint (line 122) | static vint vloadu_vi_p(int32_t *p) { return _mm256_loadu_si256((__m256i...
function vstoreu_v_p_vi (line 123) | static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm256_storeu_si256((__...
function INLINE (line 127) | static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm512_and_...
function INLINE (line 128) | static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm512_a...
function INLINE (line 129) | static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm512_or_si...
function INLINE (line 130) | static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm512_xor_...
function INLINE (line 132) | static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return _mm51...
function INLINE (line 133) | static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return _m...
function INLINE (line 134) | static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return _mm512...
function INLINE (line 135) | static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return _mm51...
function INLINE (line 137) | static INLINE vmask vand_vm_vo64_vm(vopmask o, vmask m) { return _mm512_...
function INLINE (line 138) | static INLINE vmask vandnot_vm_vo64_vm(vopmask o, vmask m) { return _mm5...
function INLINE (line 139) | static INLINE vmask vor_vm_vo64_vm(vopmask o, vmask m) { return _mm512_m...
function INLINE (line 141) | static INLINE vmask vand_vm_vo32_vm(vopmask o, vmask m) { return _mm512_...
function INLINE (line 142) | static INLINE vmask vandnot_vm_vo32_vm(vopmask o, vmask m) { return _mm5...
function INLINE (line 143) | static INLINE vmask vor_vm_vo32_vm(vopmask o, vmask m) { return _mm512_m...
function INLINE (line 145) | static INLINE vopmask vcast_vo32_vo64(vopmask o) { return o; }
function INLINE (line 146) | static INLINE vopmask vcast_vo64_vo32(vopmask o) { return o; }
function INLINE (line 150) | static INLINE vint vrint_vi_vd(vdouble vd) {
function INLINE (line 154) | static INLINE vint vtruncate_vi_vd(vdouble vd) {
function INLINE (line 158) | static INLINE vdouble vcast_vd_vi(vint vi) { return _mm512_cvtepi32_pd(v...
function INLINE (line 159) | static INLINE vint vcast_vi_i(int i) { return _mm256_set1_epi32(i); }
function INLINE (line 161) | static INLINE vdouble vtruncate_vd_vd(vdouble vd) {
function INLINE (line 165) | static INLINE vdouble vrint_vd_vd(vdouble vd) {
function INLINE (line 169) | static INLINE vint2 vcastu_vi2_vi(vint vi) {
function INLINE (line 173) | static INLINE vint vcastu_vi_vi2(vint2 vi) {
function INLINE (line 177) | static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm512_set_epi...
function INLINE (line 179) | static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm512_c...
function INLINE (line 180) | static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm512_ad...
function INLINE (line 184) | static INLINE vdouble vcast_vd_d(double d) { return _mm512_set1_pd(d); }
function INLINE (line 185) | static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm512_castp...
function INLINE (line 186) | static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm512_casts...
function INLINE (line 187) | static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return _mm512_cast...
function INLINE (line 188) | static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return _mm512_cast...
function INLINE (line 190) | static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm51...
function INLINE (line 191) | static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm51...
function INLINE (line 192) | static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm51...
function INLINE (line 193) | static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm51...
function INLINE (line 194) | static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm512_div_pd(_mm51...
function INLINE (line 195) | static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm512_sqrt_pd(x); }
function INLINE (line 196) | static INLINE vdouble vabs_vd_vd(vdouble d) { return vreinterpret_vd_vm(...
function INLINE (line 197) | static INLINE vdouble vneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(...
function INLINE (line 198) | static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm51...
function INLINE (line 199) | static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm51...
function INLINE (line 202) | static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) ...
function INLINE (line 203) | static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 204) | static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 206) | static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) ...
function INLINE (line 207) | static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 210) | static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) ...
function INLINE (line 211) | static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 212) | static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 213) | static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 214) | static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z...
function INLINE (line 216) | static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return _mm512...
function INLINE (line 217) | static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return _mm51...
function INLINE (line 218) | static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return _mm512...
function INLINE (line 219) | static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return _mm512...
function INLINE (line 220) | static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return _mm512...
function INLINE (line 221) | static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm512...
function INLINE (line 225) | static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm256_add_epi...
function INLINE (line 226) | static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm256_sub_epi...
function INLINE (line 227) | static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(...
function INLINE (line 229) | static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm256_and_si2...
function INLINE (line 230) | static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm256_andn...
function INLINE (line 232) | static INLINE vint vandnot_vi_vo_vi(vopmask o, vint y) {
function INLINE (line 235) | static INLINE vint vand_vi_vo_vi(vopmask o, vint y) {
function INLINE (line 239) | static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm256_or_si256...
function INLINE (line 240) | static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm256_xor_si2...
function INLINE (line 248) | static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm256_cmpeq_ep...
function INLINE (line 249) | static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm256_cmpgt_ep...
function INLINE (line 251) | static INLINE vopmask veq_vo_vi_vi(vint x, vint y) {
function INLINE (line 254) | static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) {
function INLINE (line 258) | static INLINE vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble ...
function vdouble (line 262) | vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
function INLINE (line 268) | static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, v...
function INLINE (line 276) | static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double...
function INLINE (line 280) | static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double...
function INLINE (line 284) | static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, v...
function INLINE (line 289) | static INLINE vopmask visinf_vo_vd(vdouble d) {
function INLINE (line 293) | static INLINE vopmask vispinf_vo_vd(vdouble d) {
function INLINE (line 297) | static INLINE vopmask visminf_vo_vd(vdouble d) {
function INLINE (line 301) | static INLINE vopmask visnan_vo_vd(vdouble d) {
function INLINE (line 305) | static INLINE vint vilogbk_vi_vd(vdouble d) { return vrint_vi_vd(_mm512_...
function INLINE (line 309) | static INLINE vint vilogb2k_vi_vd(vdouble d) { return vrint_vi_vd(_mm512...
function INLINE (line 311) | static INLINE vdouble vgetexp_vd_vd(vdouble d) { return _mm512_getexp_pd...
function INLINE (line 312) | static INLINE vfloat vgetexp_vf_vf(vfloat d) { return _mm512_getexp_ps(d...
function INLINE (line 314) | static INLINE vdouble vgetmant_vd_vd(vdouble d) { return _mm512_getmant_...
function INLINE (line 315) | static INLINE vfloat vgetmant_vf_vf(vfloat d) { return _mm512_getmant_ps...
function INLINE (line 324) | static INLINE double vcast_d_vd(vdouble v) {
function INLINE (line 331) | static INLINE vdouble vload_vd_p(const double *ptr) { return _mm512_load...
function INLINE (line 332) | static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm512_loa...
function INLINE (line 334) | static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm512_store_...
function INLINE (line 335) | static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm512_store...
function INLINE (line 337) | static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { retu...
function INLINE (line 341) | static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
function INLINE (line 347) | static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm512_castps...
function INLINE (line 348) | static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm512_castsi...
function INLINE (line 349) | static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return _mm512_casts...
function INLINE (line 350) | static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return _mm512_castp...
function INLINE (line 352) | static INLINE vdouble vreinterpret_vd_vf(vfloat vf) { return _mm512_cast...
function INLINE (line 353) | static INLINE vfloat vreinterpret_vf_vd(vdouble vd) { return _mm512_cast...
function INLINE (line 355) | static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
function INLINE (line 356) | static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
function INLINE (line 358) | static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm512_cvtepi32_ps(...
function INLINE (line 359) | static INLINE vfloat vcast_vf_f(float f) { return _mm512_set1_ps(f); }
function INLINE (line 360) | static INLINE vint2 vcast_vi2_i(int i) { return _mm512_set1_epi32(i); }
function INLINE (line 361) | static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm512...
function INLINE (line 362) | static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_m...
function INLINE (line 364) | static INLINE vfloat vtruncate_vf_vf(vfloat vd) {
function INLINE (line 368) | static INLINE vfloat vrint_vf_vf(vfloat vd) {
function INLINE (line 372) | static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm512_a...
function INLINE (line 373) | static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm512_s...
function INLINE (line 374) | static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm512_m...
function INLINE (line 375) | static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm512_d...
function INLINE (line 376) | static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_v...
function INLINE (line 377) | static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm512_sqrt_ps(x); }
function INLINE (line 378) | static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(va...
function INLINE (line 379) | static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vx...
function INLINE (line 380) | static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm512_m...
function INLINE (line 381) | static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm512_m...
function INLINE (line 384) | static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { re...
function INLINE (line 385) | static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 386) | static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 388) | static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { re...
function INLINE (line 389) | static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 390) | static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 393) | static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { re...
function INLINE (line 394) | static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 395) | static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 396) | static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 397) | static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { ...
function INLINE (line 399) | static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_c...
function INLINE (line 400) | static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_...
function INLINE (line 401) | static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_c...
function INLINE (line 402) | static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return _mm512_c...
function INLINE (line 403) | static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_c...
function INLINE (line 404) | static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return _mm512_c...
function INLINE (line 406) | static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_a...
function INLINE (line 407) | static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_s...
function INLINE (line 408) | static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcas...
function INLINE (line 409) | static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_a...
function INLINE (line 410) | static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm51...
function INLINE (line 411) | static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_or...
function INLINE (line 412) | static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_x...
function INLINE (line 414) | static INLINE vint2 vand_vi2_vo_vi2(vopmask o, vint2 m) {
function INLINE (line 418) | static INLINE vint2 vandnot_vi2_vo_vi2(vopmask o, vint2 m) {
function INLINE (line 428) | static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_c...
function INLINE (line 429) | static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_c...
function INLINE (line 431) | static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
function INLINE (line 435) | static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
function INLINE (line 440) | static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
function INLINE (line 444) | static INLINE vfloat vsel_vf_vo_vf_vf(vopmask m, vfloat x, vfloat y) {
function vfloat (line 450) | vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
function INLINE (line 454) | static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d...
function INLINE (line 458) | static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vo...
function INLINE (line 462) | static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_...
function INLINE (line 463) | static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, v...
function INLINE (line 464) | static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, v...
function INLINE (line 465) | static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d...
function INLINE (line 467) | static INLINE vint2 vilogbk_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm51...
function INLINE (line 468) | static INLINE vint2 vilogb2k_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm5...
function INLINE (line 472) | static INLINE float vcast_f_vf(vfloat v) {
function INLINE (line 479) | static INLINE vfloat vload_vf_p(const float *ptr) { return _mm512_load_p...
function INLINE (line 480) | static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm512_loadu...
function INLINE (line 482) | static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm512_store_ps...
function INLINE (line 483) | static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm512_storeu_...
function INLINE (line 485) | static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { ret...
function INLINE (line 489) | static INLINE vdouble vposneg_vd_vd(vdouble d) {
function INLINE (line 492) | static INLINE vdouble vnegpos_vd_vd(vdouble d) {
function INLINE (line 495) | static INLINE vfloat vposneg_vf_vf(vfloat d) {
function INLINE (line 498) | static INLINE vfloat vnegpos_vf_vf(vfloat d) {
function INLINE (line 502) | static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return va...
function INLINE (line 503) | static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_...
function INLINE (line 505) | static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdoubl...
function INLINE (line 506) | static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)...
function INLINE (line 508) | static INLINE vdouble vrev21_vd_vd(vdouble vd) { return _mm512_permute_p...
function INLINE (line 510) | static INLINE vdouble vreva2_vd_vd(vdouble vd) {
function INLINE (line 514) | static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm512_strea...
function INLINE (line 516) | static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int ste...
function INLINE (line 523) | static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int st...
function INLINE (line 532) | static INLINE vfloat vrev21_vf_vf(vfloat vf) { return _mm512_permute_ps(...
function INLINE (line 533) | static INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf...
function INLINE (line 535) | static INLINE vfloat vreva2_vf_vf(vfloat vf) {
function INLINE (line 539) | static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm512_stream_...
function INLINE (line 541) | static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step...
function INLINE (line 552) | static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int ste...
function INLINE (line 556) | static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) {
function INLINE (line 560) | static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
function INLINE (line 564) | static INLINE vint vuninterleave_vi_vi(vint v) {
function INLINE (line 568) | static INLINE vdouble vinterleave_vd_vd(vdouble vd) {
function INLINE (line 572) | static INLINE vdouble vuninterleave_vd_vd(vdouble vd) {
function INLINE (line 576) | static INLINE vmask vinterleave_vm_vm(vmask vm) {
function INLINE (line 580) | static INLINE vmask vuninterleave_vm_vm(vmask vm) {
function vmask2 (line 584) | static vmask2 vloadu_vm2_p(void *p) {
type Sleef_quad8 (line 591) | typedef Sleef_quad8 vargquad;
function INLINE (line 593) | static INLINE vmask2 vcast_vm2_aq(vargquad aq) {
function INLINE (line 597) | static INLINE vargquad vcast_aq_vm2(vmask2 vm2) {
function INLINE (line 606) | static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm512_mask2i...
function INLINE (line 608) | static INLINE int vtestallzeros_i_vo64(vopmask g) { return g == 0; }
function INLINE (line 611) | static INLINE vmask vsel_vm_vo64_vm_vm(vopmask m, vmask x, vmask y) { re...
function INLINE (line 613) | static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm512_su...
function INLINE (line 614) | static INLINE vmask vneg64_vm_vm(vmask x) { return _mm512_sub_epi64(vcas...
function INLINE (line 615) | static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm512_c...
function INLINE (line 622) | static INLINE vmask vcast_vm_vi(vint vi) {
function INLINE (line 625) | static INLINE vint vcast_vi_vm(vmask vm) {
FILE: src/helperneon32.h
type uint32x4_t (line 38) | typedef uint32x4_t vmask;
type uint32x4_t (line 39) | typedef uint32x4_t vopmask;
type float32x4_t (line 43) | typedef float32x4_t vfloat;
type int32x4_t (line 44) | typedef int32x4_t vint2;
function INLINE (line 48) | static INLINE void vprefetch_v_p(const void *ptr) { }
function INLINE (line 50) | static INLINE int vtestallones_i_vo32(vopmask g) {
function vfloat (line 56) | static vfloat vloaduf(float *p) { return vld1q_f32(p); }
function vstoreuf (line 57) | static void vstoreuf(float *p, vfloat v) { vst1q_f32(p, v); }
function vint2 (line 59) | static vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
function vstoreu_v_p_vi2 (line 60) | static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
function INLINE (line 64) | static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x...
function INLINE (line 65) | static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vbicq_u3...
function INLINE (line 66) | static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x,...
function INLINE (line 67) | static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x...
function INLINE (line 69) | static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vandq...
function INLINE (line 70) | static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vb...
function INLINE (line 71) | static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vorrq_...
function INLINE (line 72) | static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return veorq...
function INLINE (line 74) | static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vandq_u...
function INLINE (line 75) | static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vbic...
function INLINE (line 76) | static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vorrq_u3...
function INLINE (line 77) | static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return veorq_u...
function INLINE (line 79) | static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vandq_u...
function INLINE (line 80) | static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vbic...
function INLINE (line 81) | static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vorrq_u3...
function INLINE (line 82) | static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return veorq_u...
function INLINE (line 84) | static INLINE vopmask vcast_vo32_vo64(vopmask m) { return vuzpq_u32(m, m...
function INLINE (line 85) | static INLINE vopmask vcast_vo64_vo32(vopmask m) { return vzipq_u32(m, m...
function INLINE (line 89) | static INLINE vmask vcast_vm_i_i(int i0, int i1) { return (vmask)vdupq_n...
function INLINE (line 90) | static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
function INLINE (line 97) | static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
function INLINE (line 98) | static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
function INLINE (line 99) | static INLINE vint2 vrint_vi2_vf(vfloat d) {
function INLINE (line 102) | static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(v...
function INLINE (line 103) | static INLINE vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); }
function INLINE (line 105) | static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vt...
function INLINE (line 106) | static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_...
Condensed preview — 148 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (2,925K chars).
[
{
"path": ".clang-format",
"chars": 32,
"preview": "Standard: Cpp03\nColumnLimit: 79\n"
},
{
"path": ".gitignore",
"chars": 1053,
"preview": "# Common build dirs\nbuild*/\n\n# Dependencies\nnstools/\n\n# Binaries\n*.o\n*.so\n*.pyc\n*.exe\n*.dll\n*.dylib\n\n# Generated files\n#"
},
{
"path": "CMakeLists.txt",
"chars": 14693,
"preview": "# MIT License\n#\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtai"
},
{
"path": "CONTRIBUTING.md",
"chars": 36874,
"preview": "<!--\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof "
},
{
"path": "LICENSE",
"chars": 1057,
"preview": "Copyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this s"
},
{
"path": "README.md",
"chars": 23245,
"preview": "Documentation can be found [here](https://agenium-scale.github.io/nsimd/).\nWe put a lot of effort into\n[testing](https:/"
},
{
"path": "benches/benches.hpp",
"chars": 1430,
"preview": "#ifndef BENCHES_HPP\n#define BENCHES_HPP\n\n#include <limits>\n#include <cmath>\n#include <climits>\n\nnamespace nsimd {\nnamesp"
},
{
"path": "build.nsconfig",
"chars": 18295,
"preview": "# MIT License\n#\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtai"
},
{
"path": "doc/Makefile.nix",
"chars": 1710,
"preview": "# Copyright (c) 2020 Agenium Scale\n# \n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# o"
},
{
"path": "doc/Makefile.win",
"chars": 1855,
"preview": "# Copyright (c) 2020 Agenium Scale\n# \n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# o"
},
{
"path": "doc/markdown/compilers_and_versions.md",
"chars": 2446,
"preview": "<!--\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof "
},
{
"path": "doc/markdown/concepts.md",
"chars": 3723,
"preview": "# C++20 concepts\n\nAs of C++20, concepts are available. We quote <en.cppreference.com> to\nintroduce concepts.\n\n*Class tem"
},
{
"path": "doc/markdown/defines.md",
"chars": 7709,
"preview": "# Defines provided by NSIMD\n\nNSIMD uses macros (not function macros) that we call defines to make choices\nin its code at"
},
{
"path": "doc/markdown/faq.md",
"chars": 8459,
"preview": "<!--\n\nCopyright (c) 2020 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof "
},
{
"path": "doc/markdown/fp16.md",
"chars": 4344,
"preview": "# IEEE float16 related functions\n\nNSIMD natively supports IEEE float16's. This means that NSIMD provides types\nand funct"
},
{
"path": "doc/markdown/how_tests_are_done.md",
"chars": 19560,
"preview": "<!--\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof "
},
{
"path": "doc/markdown/memory.md",
"chars": 2837,
"preview": "# Memory functions\n\nAlthough the purpose of NSIMD is not to provide a full memory container\nlibrary, it provides some he"
},
{
"path": "doc/markdown/modules/.gitignore",
"chars": 9,
"preview": "*/api*.md"
},
{
"path": "doc/markdown/modules/fixed_point/overview.md",
"chars": 4393,
"preview": "<!--\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof "
},
{
"path": "doc/markdown/pack.md",
"chars": 8723,
"preview": "# NSIMD pack and related functions\n\nThe advanced C++ API provides types that represents SIMD registers. These\ntypes are "
},
{
"path": "doc/markdown/tutorial.md",
"chars": 10455,
"preview": "<!--\n\nCopyright (c) 2020 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof "
},
{
"path": "doc/md2html.cpp",
"chars": 5439,
"preview": "/*\n\nCopyright (c) 2020 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "doc/what_is_wrapped.cpp",
"chars": 12073,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "egg/__init__.py",
"chars": 1117,
"preview": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/common.py",
"chars": 18713,
"preview": "# Use utf-8 encoding\n# -*- coding: utf-8 -*-\n\n# Copyright (c) 2020 Agenium Scale\n#\n# permission is hereby granted, free "
},
{
"path": "egg/cuda.py",
"chars": 14205,
"preview": "# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/experiments/gen_sleef_operators.py",
"chars": 4759,
"preview": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/experiments/round-ppc.c",
"chars": 672,
"preview": "#include <altivec.h>\n#include <stdio.h>\n\nvoid pp(const char *prefix, FILE *out, float buf[4]) {\n fputs(prefix, out);\n "
},
{
"path": "egg/experiments/upcvt-sve.c",
"chars": 1294,
"preview": "#include <stdio.h>\n#include <arm_sve.h>\n\n// armclang -march=armv8+sve egg/experiments/upcvt-sve.c -o ../build/a.out\n\n// "
},
{
"path": "egg/gen_adv_c_api.py",
"chars": 13692,
"preview": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/gen_adv_cxx_api.py",
"chars": 8407,
"preview": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/gen_archis.py",
"chars": 16234,
"preview": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/gen_base_apis.py",
"chars": 4609,
"preview": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/gen_benches.py",
"chars": 41279,
"preview": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/gen_doc.py",
"chars": 29913,
"preview": "# Use utf-8 encoding\n# -*- coding: utf-8 -*-\n\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free "
},
{
"path": "egg/gen_friendly_but_not_optimized.py",
"chars": 4194,
"preview": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/gen_modules.py",
"chars": 1249,
"preview": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/gen_scalar_utilities.py",
"chars": 9440,
"preview": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/gen_src.py",
"chars": 4620,
"preview": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/gen_tests.py",
"chars": 131570,
"preview": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/get_sleef_code.py",
"chars": 9857,
"preview": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/hatch.py",
"chars": 7221,
"preview": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/modules/fixed_point/gen_doc.py",
"chars": 7322,
"preview": "# Use utf-8 encoding\n# -*- coding: utf-8 -*-\n\n# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free "
},
{
"path": "egg/modules/fixed_point/gen_tests.py",
"chars": 23387,
"preview": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/modules/fixed_point/hatch.py",
"chars": 2560,
"preview": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/modules/memory_management/hatch.py",
"chars": 5722,
"preview": "# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/modules/random/hatch.py",
"chars": 24441,
"preview": "# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/modules/spmd/hatch.py",
"chars": 38132,
"preview": "# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/modules/tet1d/hatch.py",
"chars": 35796,
"preview": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/oneapi.py",
"chars": 15389,
"preview": "\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# o"
},
{
"path": "egg/operators.py",
"chars": 64323,
"preview": "# Use utf-8 encoding\n# -*- coding: utf-8 -*-\n\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free "
},
{
"path": "egg/platform_arm.py",
"chars": 120425,
"preview": "# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/platform_cpu.py",
"chars": 30805,
"preview": "# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/platform_ppc.py",
"chars": 92284,
"preview": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/platform_x86.py",
"chars": 184139,
"preview": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/rocm.py",
"chars": 1270,
"preview": "# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/scalar.py",
"chars": 20640,
"preview": "# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "egg/x86_load_store_deg234.py",
"chars": 163351,
"preview": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of"
},
{
"path": "examples/module_fixed_point.cpp",
"chars": 2423,
"preview": "// Copyright (c) 2019 Agenium Scale\n//\n// Permission is hereby granted, free of charge, to any person obtaining a copy\n/"
},
{
"path": "examples/tutorial.cpp",
"chars": 1600,
"preview": "#include <nsimd/nsimd-all.hpp>\n\n#include <string>\n#include <vector>\n#include <iostream>\n\ntemplate <typename T>\nvoid uppe"
},
{
"path": "include/nsimd/c_adv_api.h",
"chars": 2858,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "include/nsimd/cxx_adv_api.hpp",
"chars": 47051,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "include/nsimd/cxx_adv_api_aliases.hpp",
"chars": 2137,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "include/nsimd/modules/fixed_point.hpp",
"chars": 10680,
"preview": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "include/nsimd/modules/memory_management.hpp",
"chars": 11776,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "include/nsimd/modules/spmd.hpp",
"chars": 29649,
"preview": "/*\n\nCopyright (c) 2020 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "include/nsimd/modules/tet1d.hpp",
"chars": 15426,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "include/nsimd/nsimd-all.h",
"chars": 1169,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "include/nsimd/nsimd-all.hpp",
"chars": 1266,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "include/nsimd/nsimd.h",
"chars": 63904,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "scripts/FindNSIMD.cmake",
"chars": 3900,
"preview": "# MIT License\n#\n# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtai"
},
{
"path": "scripts/aarch64-linux-gnu-clang++.sh",
"chars": 53,
"preview": "#!/bin/bash\n\nclang++ --target=aarch64-linux-gnu \"$@\"\n"
},
{
"path": "scripts/aarch64-linux-gnu-clang.sh",
"chars": 51,
"preview": "#!/bin/bash\n\nclang --target=aarch64-linux-gnu \"$@\"\n"
},
{
"path": "scripts/build-tests.bat",
"chars": 2975,
"preview": "@echo off\n\nREM Copyright (c) 2020 Agenium Scale\nREM\nREM Permission is hereby granted, free of charge, to any person obta"
},
{
"path": "scripts/build-tests.sh",
"chars": 2639,
"preview": "#!/bin/bash\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining"
},
{
"path": "scripts/build.bat",
"chars": 3098,
"preview": "@echo off\n\nREM Copyright (c) 2020 Agenium Scale\nREM\nREM Permission is hereby granted, free of charge, to any person obta"
},
{
"path": "scripts/build.sh",
"chars": 2985,
"preview": "#!/bin/bash\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining"
},
{
"path": "scripts/ci-clang.txt",
"chars": 1217,
"preview": "camelot.numscale.com (sse2-sse42-clang)\n- bash scripts/build-tests.sh for sse2/sse42 with clang\n- cd build-sse2-clang\n- "
},
{
"path": "scripts/ci-scale.txt",
"chars": 5806,
"preview": "camelot.hpc.scale <sse2-sse42-gcc> {/home/gquintin}\n- mkdir cmake-build-sse2\n- cd cmake-build-sse2\n- cmake .. -Dsimd=sse"
},
{
"path": "scripts/ci-test.txt",
"chars": 357,
"preview": "couillere <aarch64-macos> {/Users/gquintin}\n- export PATH=${PATH}:/opt/homebrew/bin\n- python3 egg/hatch.py -ltf\n- bash s"
},
{
"path": "scripts/ci.sh",
"chars": 12266,
"preview": "#!/bin/sh\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a"
},
{
"path": "scripts/compile-gmp-mpfr-for-wasm.sh",
"chars": 2899,
"preview": "#!/bin/sh\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a"
},
{
"path": "scripts/gen_github_doc.sh",
"chars": 2222,
"preview": "#!/bin/sh\n# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a"
},
{
"path": "scripts/hipcc.sh",
"chars": 87,
"preview": "#!/bin/bash\n\n/opt/rocm/bin/hipcc -D__HIPCC__ -D__hcc_major__=3 -D__hcc_minor__=10 \"$@\"\n"
},
{
"path": "scripts/init-benches-deps.sh",
"chars": 956,
"preview": "#!/bin/sh\n\n## The top-level dir\nROOT_DIR=\"$( git rev-parse --show-toplevel )\"\n\n## Where all the deps are gonna be instal"
},
{
"path": "scripts/local-ci-rerun.ini",
"chars": 1199,
"preview": "# -----------------------------------------------------------------------------\n# Intel CPU/SIMD\n\n[sse2,sse42,avx,avx2]\n"
},
{
"path": "scripts/local-ci.ini",
"chars": 3271,
"preview": "# -----------------------------------------------------------------------------\n# Intel CPU/SIMD\n\n[sse2,sse42,avx,avx2]\n"
},
{
"path": "scripts/local-ci.sh",
"chars": 2347,
"preview": "#!/bin/sh\n\n# -----------------------------------------------------------------------------\n# Init\n\nINPUT=\"`realpath ${1}"
},
{
"path": "scripts/one-liner.c",
"chars": 3893,
"preview": "/*\n\nCopyright (c) 2020 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "scripts/powerpc64le-linux-gnu-clang++.sh",
"chars": 132,
"preview": "#!/bin/bash\n\nclang++ --target=powerpc64le-linux-gnu \\\n -I/usr/powerpc64le-linux-gnu/include/c++/8/powerpc64le-lin"
},
{
"path": "scripts/powerpc64le-linux-gnu-clang.sh",
"chars": 128,
"preview": "#!/bin/bash\n\nclang --target=powerpc64le-linux-gnu \\\n -I/usr/powerpc64le-linux-gnu/include/c++/8/powerpc64le-linux-g"
},
{
"path": "scripts/setup.bat",
"chars": 2578,
"preview": "@echo off\n\nREM Copyright (c) 2020 Agenium Scale\nREM\nREM Permission is hereby granted, free of charge, to any person obta"
},
{
"path": "scripts/setup.sh",
"chars": 1986,
"preview": "#!/bin/bash\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining"
},
{
"path": "src/dd.h",
"chars": 14758,
"preview": "// Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0."
},
{
"path": "src/df.h",
"chars": 16123,
"preview": "// Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0."
},
{
"path": "src/estrin.h",
"chars": 3013,
"preview": "// Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0."
},
{
"path": "src/fp16.cpp",
"chars": 8167,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "src/gpu.cpp",
"chars": 2932,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "src/helperadvsimd.h",
"chars": 33091,
"preview": "/*********************************************************************/\n/* Copyright ARM Ltd. 2010 - 2019. "
},
{
"path": "src/helperavx.h",
"chars": 31865,
"preview": "// Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0."
},
{
"path": "src/helperavx2.h",
"chars": 26990,
"preview": "// Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0."
},
{
"path": "src/helperavx512f.h",
"chars": 30277,
"preview": "// Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0."
},
{
"path": "src/helperneon32.h",
"chars": 13567,
"preview": "// Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0."
},
{
"path": "src/helperpower_128.h",
"chars": 26365,
"preview": "// Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0."
},
{
"path": "src/helpersse2.h",
"chars": 24879,
"preview": "// Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0."
},
{
"path": "src/helpersve.h",
"chars": 39205,
"preview": "/*********************************************************************/\n/* Copyright ARM Ltd. 2010 - 2019. "
},
{
"path": "src/memory.cpp",
"chars": 1929,
"preview": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "src/misc.h",
"chars": 9250,
"preview": "// Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0."
},
{
"path": "src/rempitab.c",
"chars": 117932,
"preview": "// Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0."
},
{
"path": "src/rename.h",
"chars": 15802,
"preview": "#ifndef RENAMESCALAR_H\n #define RENAMESCALAR_H\n\n /* ----------------------------------------"
},
{
"path": "src/renameadvsimd.h",
"chars": 16145,
"preview": "#ifndef RENAMEADVSIMD_H\n #define RENAMEADVSIMD_H\n\n /* --------------------------------------"
},
{
"path": "src/renameavx.h",
"chars": 14873,
"preview": "#ifndef RENAMEAVX_H\n #define RENAMEAVX_H\n\n /* ----------------------------------------------"
},
{
"path": "src/renameavx2.h",
"chars": 15191,
"preview": "#ifndef RENAMEAVX2_H\n #define RENAMEAVX2_H\n\n /* --------------------------------------------"
},
{
"path": "src/renameavx512f.h",
"chars": 35361,
"preview": "#ifndef RENAMEAVX512F_H\n #define RENAMEAVX512F_H\n\n /* --------------------------------------"
},
{
"path": "src/renameneon32.h",
"chars": 16143,
"preview": "#ifndef RENAMENEON32_H\n #define RENAMENEON32_H\n\n /* ----------------------------------------"
},
{
"path": "src/renamesse2.h",
"chars": 15191,
"preview": "#ifndef RENAMESSE2_H\n #define RENAMESSE2_H\n\n /* --------------------------------------------"
},
{
"path": "src/renamesse4.h",
"chars": 15507,
"preview": "#ifndef RENAMESSE4_H\n #define RENAMESSE4_H\n\n /* --------------------------------------------"
},
{
"path": "src/renamesve.h",
"chars": 79413,
"preview": "#ifndef RENAMESVE_H\n #define RENAMESVE_H\n\n /* ----------------------------------------------"
},
{
"path": "src/renamevsx.h",
"chars": 29665,
"preview": "#ifndef RENAMEVSX_H\n #define RENAMEVSX_H\n\n /* ----------------------------------------------"
},
{
"path": "src/sleefdp.c",
"chars": 80677,
"preview": "// Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0."
},
{
"path": "src/sleefsimddp.c",
"chars": 173676,
"preview": "// Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0."
},
{
"path": "src/sleefsimddp_emulation.c",
"chars": 12325,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "src/sleefsimdsp.c",
"chars": 149198,
"preview": "// Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0."
},
{
"path": "src/sleefsimdsp_emulation.c",
"chars": 18850,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "src/sleefsp.c",
"chars": 64000,
"preview": "// Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0."
},
{
"path": "src/ufp.cpp",
"chars": 2552,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "tests/CMakeLists.txt.sh",
"chars": 2357,
"preview": "# MIT License\n#\n# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtai"
},
{
"path": "tests/FindNSIMD.cmake.sh",
"chars": 5816,
"preview": "#!/bin/bash\n#\n# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaini"
},
{
"path": "tests/allocator.cpp",
"chars": 1557,
"preview": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "tests/assign_arith.cpp",
"chars": 6989,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "tests/booleans.cpp",
"chars": 1344,
"preview": "/*\n\nCopyright (c) 2020 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "tests/c11_vec.c",
"chars": 1386,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "tests/cxx_adv_api_aliases.cpp",
"chars": 2497,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "tests/fp16.prec11.c",
"chars": 5495,
"preview": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "tests/get_pack.cpp",
"chars": 6350,
"preview": "/*\n\nCopyright (c) 2020 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "tests/memory.cpp",
"chars": 2842,
"preview": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "tests/memory.prec11.c",
"chars": 1314,
"preview": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "tests/modules/common.hpp",
"chars": 10228,
"preview": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "tests/nsimd-all.cpp",
"chars": 1784,
"preview": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "tests/nsimd.cpp",
"chars": 3179,
"preview": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "tests/nsimd.prec11.c",
"chars": 1123,
"preview": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "tests/operator_vector_scalar.cpp",
"chars": 1207,
"preview": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "tests/shifts.cpp",
"chars": 1587,
"preview": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "tests/templated_loads_stores.cpp",
"chars": 2905,
"preview": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
},
{
"path": "tests/tests_helpers.hpp",
"chars": 7493,
"preview": "#ifndef TESTS_HELPERS_HPP\n#define TESTS_HELPERS_HPP\n\n#include <cerrno>\n#include <cstdio>\n#include <cstdlib>\n#include <cs"
},
{
"path": "tests/to_pack.cpp",
"chars": 7701,
"preview": "#define STATUS \"test of to_pack over all types\"\n\n#include \"tests_helpers.hpp\"\n\ntemplate <typename T> bool to_pack_from_p"
},
{
"path": "tests/to_pack_interleave.cpp",
"chars": 12947,
"preview": "#define STATUS \"test of to_pack_interleave over all types\"\n\n#include \"tests_helpers.hpp\"\n\ntemplate <typename T> bool to_"
},
{
"path": "tests/ufp.cpp",
"chars": 3354,
"preview": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof th"
}
]
About this extraction
This page contains the full source code of the agenium-scale/nsimd GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 148 files (2.7 MB), approximately 715.6k tokens, and a symbol index with 3964 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.